reorder files, in preparation for Bullet 3 -> Bullet 2 merge

This commit is contained in:
erwincoumans 2013-04-29 19:04:08 -07:00
parent 55b69201a9
commit 3ac332f3a7
162 changed files with 215 additions and 3070 deletions

View File

@ -1,7 +1,7 @@
#include "GpuDemo.h"
#include "GpuDemoInternalData.h"
#include "Bullet3Common/b3Scalar.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/ShapeData.h"
#include "OpenGLWindow/GLInstancingRenderer.h"

View File

@ -1,7 +1,7 @@
#ifndef GPU_DEMO_INTERNAL_DATA_H
#define GPU_DEMO_INTERNAL_DATA_H
#include "basic_initialize/b3OpenCLInclude.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
struct GpuDemoInternalData
{

View File

@ -2,7 +2,7 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "OpenGLWindow/ShapeData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#define MSTRINGIFY(A) #A
static char* particleKernelsString =
@ -12,10 +12,10 @@ static char* particleKernelsString =
#include "Bullet3Common/b3Vector3.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
//#include "../../opencl/primitives/AdlPrimitives/Math/Math.h"
//#include "../../opencl/broadphase_benchmark/b3GridBroadphaseCL.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "GpuDemoInternalData.h"

View File

@ -4,12 +4,12 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
static b3KeyboardCallback oldCallback = 0;
extern bool gReset;

View File

@ -4,7 +4,7 @@ function createProject(vendor)
if (hasCL) then
project ("Bullet3_OpenCL_gpu_demo_" .. vendor)
project ("App_Bullet3_OpenCL_Demos_" .. vendor)
initOpenCL(vendor)
@ -20,8 +20,7 @@ function createProject(vendor)
includedirs {
"..",
"../../src",
"../../btgui",
"../../opencl"
"../../btgui"
}
links {
@ -30,21 +29,23 @@ function createProject(vendor)
"Bullet3Geometry",
"Bullet3Collision",
"Bullet3Dynamics",
"Bullet2FileLoader"
"Bullet2FileLoader",
"Bullet3OpenCL_" .. vendor
}
files {
"**.cpp",
"**.h",
"../ObjLoader/string_extra.cpp",
"../ObjLoader/string_extra.h",
"../ObjLoader/objLoader.cpp",
"../ObjLoader/objLoader.h",
"../ObjLoader/obj_parser.cpp",
"../ObjLoader/obj_parser.h",
"../ObjLoader/list.cpp",
"../ObjLoader/list.h",
"../Wavefront/string_extra.cpp",
"../Wavefront/string_extra.h",
"../Wavefront/objLoader.cpp",
"../Wavefront/objLoader.h",
"../Wavefront/obj_parser.cpp",
"../Wavefront/obj_parser.h",
"../Wavefront/list.cpp",
"../Wavefront/list.h",
"../../btgui/OpenGLWindow/GLInstancingRenderer.cpp",
@ -60,21 +61,6 @@ function createProject(vendor)
"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.cpp",
"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.h",
"../../btgui/FontFiles/OpenSans.cpp",
"../../opencl/basic_initialize/b3OpenCLUtils.cpp",
"../../opencl/basic_initialize/b3OpenCLUtils.h",
"../../opencl/gpu_broadphase/host/b3GpuSapBroadphase.cpp",
"../../opencl/gpu_narrowphase/host/**.cpp",
"../../opencl/gpu_narrowphase/host/**.h",
"../../opencl/parallel_primitives/host/b3BoundSearchCL.cpp",
"../../opencl/parallel_primitives/host/b3BoundSearchCL.h",
"../../opencl/parallel_primitives/host/b3FillCL.cpp",
"../../opencl/parallel_primitives/host/b3FillCL.h",
"../../opencl/parallel_primitives/host/b3PrefixScanCL.cpp",
"../../opencl/parallel_primitives/host/b3PrefixScanCL.h",
"../../opencl/parallel_primitives/host/b3RadixSort32CL.cpp",
"../../opencl/parallel_primitives/host/b3RadixSort32CL.h",
"../../opencl/gpu_rigidbody/host/**.cpp",
"../../opencl/gpu_rigidbody/host/**.h",
}

View File

@ -22,11 +22,11 @@ extern bool enableExperimentalCpuConcaveCollision;
//#include "LinearMath/b3Quickprof.h"
#include "Bullet3Common/b3Quaternion.h"
#include "Bullet3Common/b3Matrix3x3.h"
#include "gpu_narrowphase/host/b3ConvexUtility.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.h"
#include "OpenGLWindow/ShapeData.h"
#include "../../ObjLoader/objLoader.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "../../Wavefront/objLoader.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
///work-in-progress
///This ReadBulletSample is kept as simple as possible without dependencies to the Bullet SDK.

View File

@ -6,17 +6,17 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "gpu_rigidbody/host/b3Config.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/RigidBody/b3Config.h"
#include "GpuRigidBodyDemoInternalData.h"
#include"../../ObjLoader/objLoader.h"
#include"../../Wavefront/objLoader.h"
#include "Bullet3Common/b3Transform.h"
#include "OpenGLWindow/GLInstanceGraphicsShape.h"

View File

@ -6,15 +6,15 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "gpu_rigidbody/host/b3Config.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/RigidBody/b3Config.h"
#include "GpuRigidBodyDemoInternalData.h"
#include "Bullet3Common/b3Transform.h"

View File

@ -6,15 +6,15 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "gpu_rigidbody/host/b3Config.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/RigidBody/b3Config.h"
#include "GpuRigidBodyDemoInternalData.h"
#include "../gwenUserInterface.h"
#include "Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h"

View File

@ -4,15 +4,15 @@
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "gpu_rigidbody/host/b3Config.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/RigidBody/b3Config.h"
#include "GpuRigidBodyDemoInternalData.h"
#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"

View File

@ -1,8 +1,8 @@
#ifndef GPU_RIGIDBODY_INTERNAL_DATA_H
#define GPU_RIGIDBODY_INTERNAL_DATA_H
#include "basic_initialize/b3OpenCLUtils.h"
#include "parallel_primitives/host/b3OpenCLArray.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Common/b3Vector3.h"
struct GpuRigidBodyDemoInternalData

View File

@ -2,19 +2,18 @@
#include "GpuRigidBodyDemo.h"
#include "Bullet3Common/b3Quickprof.h"
#include "OpenGLWindow/ShapeData.h"
#include "OpenGLWindow/GLInstancingRenderer.h"
#include "Bullet3Common/b3Quaternion.h"
#include "OpenGLWindow/b3gWindowInterface.h"
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "OpenGLWindow/OpenGLInclude.h"
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
#include "gpu_rigidbody/host/b3Config.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/RigidBody/b3Config.h"
#include "GpuRigidBodyDemoInternalData.h"
#include "../gwenUserInterface.h"

View File

@ -1,5 +1,5 @@
project "Gwen_OpenGLTest"
project "Test_Gwen_OpenGL"
kind "ConsoleApp"
flags {"Unicode"}

View File

@ -91,30 +91,35 @@
if not _OPTIONS["ios"] then
include "../demo/gpudemo"
include "../btgui/MidiTest"
-- include "../demo/gpudemo"
-- include "../btgui/MidiTest"
-- include "../opencl/vector_add_simplified"
-- include "../opencl/vector_add"
include "../opencl/basic_initialize"
include "../demo/gpu_initialize"
include "../opencl/parallel_primitives/host"
include "../opencl/parallel_primitives/test"
include "../opencl/parallel_primitives/benchmark"
include "../opencl/lds_bank_conflict"
-- include "../opencl/reduce"
-- include "../opencl/gpu_broadphase/test"
-- include "../opencl/gpu_narrowphase/test"
include "../btgui/Gwen"
include "../btgui/GwenOpenGLTest"
include "../btgui/OpenGLTrueTypeFont"
-- include "../btgui/OpenGLWindow"
-- include "../demo/ObjLoader"
include "../test/OpenCL/BasicInitialize"
include "../test/OpenCL/BroadphaseCollision"
include "../test/OpenCL/NarrowphaseCollision"
include "../test/OpenCL/ParallelPrimitives"
include "../src/Bullet3Dynamics"
include "../src/Bullet3Common"
include "../src/Bullet3Geometry"
include "../src/Bullet3Collision"
include "../src/Bullet3Serialize/Bullet2FileLoader"
include "../src/Bullet3OpenCL"
include "../Demos3/GpuDemos"
-- include "../demo/gpu_initialize"
-- include "../opencl/lds_bank_conflict"
-- include "../opencl/reduce"
-- include "../btgui/OpenGLTrueTypeFont"
-- include "../btgui/OpenGLWindow"
-- include "../demo/ObjLoader"
include "../test/b3DynamicBvhBroadphase"
-- include "../test/b3DynamicBvhBroadphase"
end

View File

@ -1,28 +0,0 @@
-- Declares the "OpenCL_intialize_<vendor>" console application when
-- findOpenCL(vendor) reports that the vendor's OpenCL SDK is available.
-- findOpenCL/initOpenCL are not defined in this script (presumably provided by
-- the including build script -- verify).
function createProject(vendor)
	hasCL = findOpenCL(vendor)
	if (not hasCL) then
		return
	end

	project ("OpenCL_intialize_" .. vendor)
	initOpenCL(vendor)
	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"
	files {
		"main.cpp",
		"b3OpenCLUtils.cpp",
		"b3OpenCLUtils.h"
	}
end

-- One project per supported vendor, declared in the same order as before.
for _, vendor in ipairs({"Apple", "AMD", "Intel", "NVIDIA"}) do
	createProject(vendor)
end

View File

@ -1,129 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "../basic_initialize/b3OpenCLUtils.h"
#include "../host/b3GpuSapBroadphase.h"
#include "Bullet3Common/b3Vector3.h"
#include "parallel_primitives/host/b3FillCL.h"
#include "parallel_primitives/host/b3BoundSearchCL.h"
#include "parallel_primitives/host/b3RadixSort32CL.h"
#include "parallel_primitives/host/b3PrefixScanCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include "Bullet3Common/b3MinMax.h"
// Running totals over every test executed by this program.
int g_nPassed = 0;
int g_nFailed = 0;
// Set by TEST_ASSERT when a check inside the current test fails.
bool g_testFailed = 0;
// The macros below are wrapped in do { } while (0) so each one expands to a
// single statement and stays correct inside an un-braced if/else. Call sites
// must end with ';' (all existing ones do).
// Clears the per-test failure flag; use at the start of each test.
#define TEST_INIT do { g_testFailed = 0; } while (0)
// Marks the current test as failed when x evaluates to false.
#define TEST_ASSERT(x) do { if( !(x) ){g_testFailed = 1;} } while (0)
// Prints "[X] name" on failure or "[O] name" on success and bumps the matching counter.
#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while (0)
// Rounds num up to the next multiple of alignment (num itself if already a multiple).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
// OpenCL handles shared by initCL(), the tests and exitCL().
// g_context, g_device and g_queue are assigned in initCL(); g_device and
// g_queue stay 0 when the context reports no devices.
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
// Name of the selected device, assigned in initCL() from the device info.
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
b3OpenCLDeviceInfo info;
g_device= b3OpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3OpenCLUtils::printDeviceInfo(g_device);
b3OpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
/// Releases the command queue and context created by initCL().
/// Handles that were never created (still 0, e.g. when initCL found no
/// device) are skipped, and released handles are reset to 0 so that a second
/// call does not release them again.
void exitCL()
{
	if (g_queue)
	{
		clReleaseCommandQueue(g_queue);
		g_queue = 0;
	}
	if (g_context)
	{
		clReleaseContext(g_context);
		g_context = 0;
	}
}
inline void broadphaseTest()
{
TEST_INIT;
b3GpuSapBroadphase* sap = new b3GpuSapBroadphase(g_context,g_device,g_queue);
int group=1;
int mask=1;
b3Vector3 aabbMin(0,0,0);
b3Vector3 aabbMax(1,1,1);
int usrPtr = 1;
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
aabbMin.setValue(1,1,1);
aabbMax.setValue(2,2,2);
usrPtr = 2;
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
sap->writeAabbsToGpu();
sap->calculateOverlappingPairs();
int numOverlap = sap->getNumOverlap();
cl_mem buf = sap->getOverlappingPairBuffer();
TEST_ASSERT(numOverlap==1);
delete sap;
TEST_REPORT( "broadphaseTest" );
}
/// Entry point: reads the optional "deviceId" / "platformId" command-line
/// arguments, initializes OpenCL, runs the broadphase test, prints the
/// pass/fail totals and waits for <enter> before releasing OpenCL.
int main(int argc, char** argv)
{
	// -1 is passed through to initCL() when the argument is not given.
	int preferredDeviceIndex = -1;
	int preferredPlatformIndex = -1;
	b3CommandLineArgs args(argc, argv);
	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
	initCL(preferredDeviceIndex,preferredPlatformIndex);
	broadphaseTest();
	// The format string has a single %d; the stray g_nFailed argument was dropped.
	printf("%d tests passed\n",g_nPassed);
	if (g_nFailed)
	{
		printf("%d tests failed\n",g_nFailed);
	}
	printf("End, press <enter>\n");
	getchar();
	exitCL();
	return 0;
}

View File

@ -1,46 +0,0 @@
-- Declares the "OpenCL_broadphase_test_<vendor>" console application when
-- findOpenCL(vendor) reports that the vendor's OpenCL SDK is available.
-- findOpenCL/initOpenCL are not defined in this script (presumably provided by
-- the including build script -- verify).
function createProject(vendor)
	hasCL = findOpenCL(vendor)
	if (hasCL) then
		project ("OpenCL_broadphase_test_" .. vendor)
		initOpenCL(vendor)
		language "C++"
		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../..","../../../src"}
		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3GpuSapBroadphase.cpp",
			"../host/b3GpuSapBroadphase.h",
			-- The parallel-primitive sources use the b3 prefix (main.cpp includes
			-- parallel_primitives/host/b3FillCL.h etc.); the former bt* names
			-- did not match those files.
			"../../parallel_primitives/host/b3FillCL.cpp",
			"../../parallel_primitives/host/b3FillCL.h",
			"../../parallel_primitives/host/b3BoundSearchCL.cpp",
			"../../parallel_primitives/host/b3BoundSearchCL.h",
			"../../parallel_primitives/host/b3PrefixScanCL.cpp",
			"../../parallel_primitives/host/b3PrefixScanCL.h",
			"../../parallel_primitives/host/b3RadixSort32CL.cpp",
			"../../parallel_primitives/host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
		}
	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@ -1,111 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "../basic_initialize/b3OpenCLUtils.h"
#include "../host/b3ConvexHullContact.h"
#include "Bullet3Common/b3Vector3.h"
#include "parallel_primitives/host/b3FillCL.h"
#include "parallel_primitives/host/b3BoundSearchCL.h"
#include "parallel_primitives/host/b3RadixSort32CL.h"
#include "parallel_primitives/host/b3PrefixScanCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include "../host/b3ConvexHullContact.h"
#include "Bullet3Common/b3MinMax.h"
// Running totals over every test executed by this program.
int g_nPassed = 0;
int g_nFailed = 0;
// Set by TEST_ASSERT when a check inside the current test fails.
bool g_testFailed = 0;
// The macros below are wrapped in do { } while (0) so each one expands to a
// single statement and stays correct inside an un-braced if/else. Call sites
// must end with ';' (all existing ones do).
// Clears the per-test failure flag; use at the start of each test.
#define TEST_INIT do { g_testFailed = 0; } while (0)
// Marks the current test as failed when x evaluates to false.
#define TEST_ASSERT(x) do { if( !(x) ){g_testFailed = 1;} } while (0)
// Prints "[X] name" on failure or "[O] name" on success and bumps the matching counter.
#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while (0)
// Rounds num up to the next multiple of alignment (num itself if already a multiple).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
// OpenCL handles shared by initCL(), the tests and exitCL().
// g_context, g_device and g_queue are assigned in initCL(); g_device and
// g_queue stay 0 when the context reports no devices.
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
// Name of the selected device, assigned in initCL() from the device info.
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
b3OpenCLDeviceInfo info;
g_device= b3OpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3OpenCLUtils::printDeviceInfo(g_device);
b3OpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
/// Releases the command queue and context created by initCL().
/// Handles that were never created (still 0, e.g. when initCL found no
/// device) are skipped, and released handles are reset to 0 so that a second
/// call does not release them again.
void exitCL()
{
	if (g_queue)
	{
		clReleaseCommandQueue(g_queue);
		g_queue = 0;
	}
	if (g_context)
	{
		clReleaseContext(g_context);
		g_context = 0;
	}
}
inline void gpuConvexHullContactTest()
{
TEST_INIT;
TEST_ASSERT(1);
GpuSatCollision* sat = new GpuSatCollision(g_context,g_device,g_queue);
delete sat;
TEST_REPORT( "gpuConvexHullContactTest" );
}
/// Entry point: reads the optional "deviceId" / "platformId" command-line
/// arguments, initializes OpenCL, runs the convex-hull contact test, prints
/// the pass/fail totals and waits for <enter> before releasing OpenCL.
int main(int argc, char** argv)
{
	// -1 is passed through to initCL() when the argument is not given.
	int preferredDeviceIndex = -1;
	int preferredPlatformIndex = -1;
	b3CommandLineArgs args(argc, argv);
	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
	initCL(preferredDeviceIndex,preferredPlatformIndex);
	gpuConvexHullContactTest();
	// The format string has a single %d; the stray g_nFailed argument was dropped.
	printf("%d tests passed\n",g_nPassed);
	if (g_nFailed)
	{
		printf("%d tests failed\n",g_nFailed);
	}
	printf("End, press <enter>\n");
	getchar();
	exitCL();
	return 0;
}

View File

@ -1,49 +0,0 @@
-- Declares the "OpenCL_sat_test_<vendor>" console application when
-- findOpenCL(vendor) reports that the vendor's OpenCL SDK is available.
-- findOpenCL/initOpenCL are not defined in this script (presumably provided by
-- the including build script -- verify).
function createProject(vendor)
	hasCL = findOpenCL(vendor)
	if (hasCL) then
		project ("OpenCL_sat_test_" .. vendor)
		initOpenCL(vendor)
		language "C++"
		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../..","../../../src"}
		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/**.cpp",
			"../host/**.h",
			-- The parallel-primitive sources use the b3 prefix (main.cpp includes
			-- parallel_primitives/host/b3FillCL.h etc.); the former bt* names
			-- did not match those files.
			"../../parallel_primitives/host/b3FillCL.cpp",
			"../../parallel_primitives/host/b3FillCL.h",
			"../../parallel_primitives/host/b3BoundSearchCL.cpp",
			"../../parallel_primitives/host/b3BoundSearchCL.h",
			"../../parallel_primitives/host/b3PrefixScanCL.cpp",
			"../../parallel_primitives/host/b3PrefixScanCL.h",
			"../../parallel_primitives/host/b3RadixSort32CL.cpp",
			"../../parallel_primitives/host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
			"../../../src/Bullet3Geometry/**.cpp",
			"../../../src/Bullet3Geometry/**.h",
		}
	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@ -1,171 +0,0 @@
// Tile edge length (TILE_DIM) and row step (BLOCK_ROWS) used by the kernels in
// this file. The host program declares constants with the same names and
// values; keep both in sync.
#define TILE_DIM 32
#define BLOCK_ROWS 8
/*// simple copy kernel (CUDA)
// Used as reference case representing best effective bandwidth.
__global__ void copy(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[(y+j)*width + x] = idata[(y+j)*width + x];
}
*/
// simple copy kernel (OpenCL)
// Used as reference case representing best effective bandwidth.
// Each work-group copies one TILE_DIM x TILE_DIM tile; each work-item copies
// the elements of its column that are BLOCK_ROWS rows apart.
// The tile origin and loop bounds use TILE_DIM / BLOCK_ROWS like the CUDA
// reference; the previous get_num_groups()-based form only matched when the
// launch grid happened to have TILE_DIM groups per dimension.
// NOTE(review): assumes a TILE_DIM x BLOCK_ROWS work-group, as the host sets up.
__kernel void copyKernel(__global float* odata, __global const float* idata)
{
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
	{
		odata[(y+j)*width + x] = idata[(y+j)*width + x];
	}
}
/*
// copy kernel using shared memory (CUDA)
// Also used as reference case, demonstrating effect of using shared memory.
__global__ void copySharedMem(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM * TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}
*/
// copy kernel using shared memory (OpenCL)
// Also used as reference case, demonstrating effect of using shared memory.
// Stages one TILE_DIM x TILE_DIM tile in __local memory, then writes it back
// to the same position in odata.
// The tile origin uses TILE_DIM like the CUDA reference; the previous
// get_num_groups()-based form only matched when the launch grid happened to
// have TILE_DIM groups per dimension.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM * TILE_DIM];
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];
	// every work-item must have finished filling the tile before it is read back
	barrier(CLK_LOCAL_MEM_FENCE);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}
/*
// naive transpose (CUDA)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
__global__ void transposeNaive(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
*/
// naive transpose (OpenCL)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
// The tile origin uses TILE_DIM like the CUDA reference; the previous
// get_num_groups()-based form only matched when the launch grid happened to
// have TILE_DIM groups per dimension.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);
	for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
		odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
/*
// coalesced transpose (CUDA)
// Uses shared memory to achieve coalescing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
*/
// coalesced transpose (OpenCL)
// Uses shared memory to achieve coalescing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
// The read-side tile origin now uses TILE_DIM, matching both the CUDA
// reference and the write-side offsets below; the previous
// get_num_groups()-based form only matched when the launch grid happened to
// have TILE_DIM groups per dimension.
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM];
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
	// the tile is read with swapped indices below, so all writes must be visible first
	barrier(CLK_LOCAL_MEM_FENCE);
	x = get_group_id(1) * TILE_DIM + get_local_id(0); // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
// No bank-conflict transpose (OpenCL)
// Same as transposeCoalesced except each tile row is padded by one element
// (TILE_DIM+1 columns) to avoid shared memory bank conflicts.
// The read-side tile origin now uses TILE_DIM, matching the write-side
// offsets below; the previous get_num_groups()-based form only matched when
// the launch grid happened to have TILE_DIM groups per dimension.
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM+1];
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
	// the tile is read with swapped indices below, so all writes must be visible first
	barrier(CLK_LOCAL_MEM_FENCE);
	x = get_group_id(1) * TILE_DIM + get_local_id(0); // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}

View File

@ -1,361 +0,0 @@
//Adapted from CUDA to OpenCL by Erwin Coumans
//See http://bitbucket.org/erwincoumans/opencl_course
// Copyright 2012 NVIDIA Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "b3OpenCLUtils.h"
#include "../parallel_primitives/host/b3OpenCLArray.h"
#include "../parallel_primitives/host/b3LauncherCL.h"
#include "Bullet3Common/b3Quickprof.h"
#include "../parallel_primitives/host/b3FillCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl
// TILE_DIM / BLOCK_ROWS: tile edge length and row step shared with the kernels.
const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;
// Factor applied in postprocess() when turning the elapsed time into the
// printed bandwidth figure.
const int NUM_REPS = 100;
// Check errors and print GB/s
// Compares res against ref element by element. On the first difference it
// prints the offending index with both values plus a FAILED banner and stops;
// when all n elements match it prints the bandwidth figure computed from n,
// NUM_REPS and the elapsed time ms.
void postprocess(const float *ref, const float *res, int n, float ms)
{
	// advance to the first mismatching element (or to n when none exists)
	int idx = 0;
	while (idx < n && res[idx] == ref[idx])
	{
		++idx;
	}

	if (idx < n)
	{
		printf("\nError: at res[%d] got %f but expected %f\n", idx, res[idx], ref[idx]);
		printf("%25s\n", "*** FAILED ***");
		return;
	}

	printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
}
// Loads a whole file into a newly malloc'ed, NUL-terminated string, prefixed
// with cPreamble.
//   cFilename     path of the file to read (opened in binary mode)
//   cPreamble     text placed in front of the file contents; NULL is treated
//                 as an empty preamble
//   szFinalLength optional out-parameter; on success receives the combined
//                 length (preamble + file), excluding the terminating NUL
// Returns the combined string, which the caller must free(), or NULL when the
// file cannot be opened, its size cannot be determined, memory cannot be
// allocated or the file cannot be read completely.
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	if (cFilename == NULL)
	{
		return NULL;
	}
	if (cPreamble == NULL)
	{
		cPreamble = "";
	}

	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	const size_t szPreambleLength = strlen(cPreamble);
	char* cSourceString = NULL;

	// get the length of the source code; ftell reports failure as -1
	long fileSize = -1;
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		fileSize = ftell(pFileStream);
	}

	if (fileSize >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
	{
		const size_t szSourceLength = (size_t)fileSize;

		// allocate a buffer for the source code string and read it in
		cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
		if (cSourceString != NULL)
		{
			memcpy(cSourceString, cPreamble, szPreambleLength);
			if (szSourceLength > 0 &&
				fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) != szSourceLength)
			{
				// short read: do not hand out a partially filled buffer
				free(cSourceString);
				cSourceString = NULL;
			}
			else
			{
				cSourceString[szSourceLength + szPreambleLength] = '\0';
				if (szFinalLength != NULL)
				{
					*szFinalLength = szSourceLength + szPreambleLength;
				}
			}
		}
	}

	// close the file on every path
	fclose(pFileStream);
	return cSourceString;
}
int main(int argc, char **argv)
{
printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
b3CommandLineArgs args(argc,argv);
const int nx = 1024;
const int ny = 1024;
const int mem_size = nx*ny*sizeof(float);
const int num_elements = nx*ny;
b3Clock clock;
double startEvent=0.f;
double stopEvent=0.f;
int localSizeX = TILE_DIM;
int localSizeY = BLOCK_ROWS;
int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;
int gridX = numThreadsX / localSizeX;
int gridY = numThreadsY / localSizeY;
int ciErrNum = 0;
int preferred_device = -1;
int preferred_platform = -1;
args.GetCmdLineArgument("deviceId",preferred_device);
args.GetCmdLineArgument("platformId",preferred_platform);
cl_platform_id platformId=0;
cl_context ctx=0;
cl_command_queue queue=0;
cl_device_id device=0;
cl_kernel copyKernel=0;
cl_kernel copySharedMemKernel=0;
cl_kernel transposeNaiveKernel = 0;
cl_kernel transposeCoalescedKernel = 0;
cl_kernel transposeNoBankConflictsKernel= 0;
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
b3OpenCLUtils::printPlatformInfo(platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
device = b3OpenCLUtils::getDevice(ctx,0);
b3OpenCLUtils::printDeviceInfo(device);
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";
size_t szKernelLength;
const char* cSourceCL =0;
char relativeFileName[1024];
{
const char* prefix[]={"./","../","../../","../../../","../../../../"};
int numPrefixes = sizeof(prefix)/sizeof(char*);
for (int i=0;!cSourceCL && i<numPrefixes;i++)
{
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
if (cSourceCL)
{
printf("Loaded program source: %s\n", relativeFileName);
}
}
}
if (!cSourceCL)
{
printf("Couldn't find file %s, exiting\n",cSourceFile);
exit(0);
}
char flags[1024]={0};
#ifdef CL_PLATFORM_INTEL
///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
#endif//CL_PLATFORM_INTEL
copyKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
copySharedMemKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
transposeNaiveKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
transposeCoalescedKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
transposeNoBankConflictsKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);
b3FillCL clMemSet(ctx,device,queue);
printf("\n============================================\n");
printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n",
nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *h_tdata = (float*)malloc(mem_size);
float *gold = (float*)malloc(mem_size);
b3OpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
b3OpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
b3OpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);
// check parameters and calculate execution configuration
if (nx % TILE_DIM || ny % TILE_DIM)
{
printf("nx and ny must be a multiple of TILE_DIM\n");
goto error_exit;
}
if (TILE_DIM % BLOCK_ROWS)
{
printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
goto error_exit;
}
// host
for (int j = 0; j < ny; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// correct result for error checking
for (int j = 0; j < ny; j++)
for (int i = 0; i < nx; i++)
{
gold[j*nx + i] = h_idata[i*nx + j];
}
d_idataCL.copyFromHostPointer(h_idata,num_elements);
// events for timing
clock.reset();
float ms;
// ------------
// time kernels
// ------------
printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
// ----
// copy
// ----
printf("%25s", "copy");
clMemSet.execute(d_cdataCL,0.f,num_elements);
{
// warm up
b3LauncherCL launcher( queue, copyKernel);
launcher.setBuffer( d_cdataCL.getBufferCL());
launcher.setBuffer( d_idataCL.getBufferCL());
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
startEvent = clock.getTimeMicroseconds()/1e3;
for (int i = 0; i < NUM_REPS; i++)
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(queue);
stopEvent = clock.getTimeMicroseconds()/1e3;
}
ms = float(stopEvent-startEvent);
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
postprocess(h_idata, h_cdata, nx*ny, ms);
// -------------
// copySharedMem
// -------------
printf("%25s", "shared memory copy");
clMemSet.execute(d_cdataCL,0.f,num_elements);
{
b3LauncherCL launcher( queue, copySharedMemKernel);
launcher.setBuffer( d_cdataCL.getBufferCL());
launcher.setBuffer( d_idataCL.getBufferCL());
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
startEvent = clock.getTimeMicroseconds()/1e3;
for (int i = 0; i < NUM_REPS; i++)
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(queue);
stopEvent = clock.getTimeMicroseconds()/1e3;
}
ms = float(stopEvent-startEvent);
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
postprocess(h_idata, h_cdata, nx * ny, ms);
// --------------
// transposeNaive
// --------------
printf("%25s", "naive transpose");
clMemSet.execute(d_tdataCL,0.f,num_elements);
{
// warmup
b3LauncherCL launcher( queue, transposeNaiveKernel);
launcher.setBuffer( d_tdataCL.getBufferCL());
launcher.setBuffer( d_idataCL.getBufferCL());
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
startEvent = clock.getTimeMicroseconds()/1e3;
for (int i = 0; i < NUM_REPS; i++)
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(queue);
stopEvent = clock.getTimeMicroseconds()/1e3;
}
ms = float(stopEvent-startEvent);
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
postprocess(gold, h_tdata, nx * ny, ms);
// ------------------
// transposeCoalesced
// ------------------
printf("%25s", "coalesced transpose");
clMemSet.execute(d_tdataCL,0.f,num_elements);
{
b3LauncherCL launcher( queue, transposeCoalescedKernel);
launcher.setBuffer( d_tdataCL.getBufferCL());
launcher.setBuffer( d_idataCL.getBufferCL());
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
startEvent = clock.getTimeMicroseconds()/1e3;
for (int i = 0; i < NUM_REPS; i++)
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(queue);
stopEvent = clock.getTimeMicroseconds()/1e3;
}
ms = float(stopEvent-startEvent);
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
postprocess(gold, h_tdata, nx * ny, ms);
// ------------------------
// transposeNoBankConflicts
// ------------------------
printf("%25s", "conflict-free transpose");
clMemSet.execute(d_tdataCL,0.f,num_elements);
{
b3LauncherCL launcher( queue, transposeNoBankConflictsKernel);
launcher.setBuffer( d_tdataCL.getBufferCL());
launcher.setBuffer( d_idataCL.getBufferCL());
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
startEvent = clock.getTimeMicroseconds()/1e3;
for (int i = 0; i < NUM_REPS; i++)
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(queue);
stopEvent = clock.getTimeMicroseconds()/1e3;
}
ms = float(stopEvent-startEvent);
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
postprocess(gold, h_tdata, nx * ny, ms);
error_exit:
// cleanup
clReleaseKernel(copyKernel);
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
free(h_idata);
free(h_tdata);
free(h_cdata);
free(gold);
printf("Press <enter>\n");
getchar();
}

View File

@ -1,44 +0,0 @@
-- Defines the "OpenCL_lds_bank_conflict_<vendor>" console application.
-- Nothing is generated when findOpenCL reports no SDK for the given vendor.
function createProject (vendor)

	local hasCL = findOpenCL(vendor)
	if (not hasCL) then
		return
	end

	project ( "OpenCL_lds_bank_conflict_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize",
		"../../src"
	}

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
		"../../src/Bullet3Common/b3Quickprof.cpp",
		"../../src/Bullet3Common/b3Quickprof.h",
	}
end

-- One project per OpenCL vendor, in the same order as before.
for _, vendor in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendor)
end

View File

@ -1,40 +0,0 @@
-- Defines the "OpenCL_radixsort_benchmark_<vendor>" console application.
-- Nothing is generated when findOpenCL reports no SDK for the given vendor.
function createProject(vendor)

	-- 'local' keeps the lookup result from leaking into the global table
	-- shared by every premake script (the original assigned a global).
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_radixsort_benchmark_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../../../src"}

		links {
			("OpenCL_lib_parallel_primitives_host_" .. vendor)
		}

		files {
			"test_large_problem_sorting.cpp",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3FillCL.cpp",
			"../host/b3PrefixScanCL.cpp",
			"../host/b3RadixSort32CL.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@ -1,711 +0,0 @@
/******************************************************************************
* Copyright 2010 Duane Merrill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*
* AUTHORS' REQUEST:
*
* If you use|reference|benchmark this code, please cite our Technical
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
*
* @TechReport{ Merrill:Sorting:2010,
* author = "Duane Merrill and Andrew Grimshaw",
* title = "Revisiting Sorting for GPGPU Stream Architectures",
* year = "2010",
* institution = "University of Virginia, Department of Computer Science",
* address = "Charlottesville, VA, USA",
* number = "CS2010-03"
* }
*
* For more information, see our Google Code project site:
* http://code.google.com/p/back40computing/
*
* Thanks!
******************************************************************************/
/******************************************************************************
* Simple test driver program for *large-problem* radix sorting.
*
* Useful for demonstrating how to integrate radix sorting into
* your application
******************************************************************************/
/******************************************************************************
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
******************************************************************************/
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <string>
//#include <iostream>
#include <sstream>
/**********************
*
*/
#include "../host/b3RadixSort32CL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "Bullet3Common/b3Quickprof.h"
// OpenCL state shared by the whole benchmark. main() creates the context,
// picks the device and creates the command queue; TimedSort() reads all three.
cl_context g_cxMainContext;
cl_device_id g_device;
cl_command_queue g_cqCommandQueue;
/***********************
* Benchmark options
*/
// Set from the --v command line flag; when true TestSort() prints the sorted keys.
bool g_verbose;
///Preferred OpenCL device/platform. When < 0 then no preference is used.
///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
///Preferred device/platform take priority over this platform-vendor match
int gPreferredDeviceId = -1;
int gPreferredPlatformId = -1;
/******************************************************************************
* Routines
******************************************************************************/
/**
 * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * The keys are converted to unsigned int for the device sort and the sorted
 * result is written back into h_keys.
 *
 * @param[in] 		num_elements
 * 		Size in elements of the vector to sort
 * @param[in,out] 	h_keys
 * 		Vector of keys to sort; overwritten with the sorted keys
 * @param[in] 		iterations
 * 		Number of times to invoke the GPU sorting primitive
 */
template <typename K>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	unsigned int iterations)
{
	printf("Keys only, %u iterations, %u elements\n", iterations, num_elements);

	const int count = (int) num_elements;

	b3AlignedObjectArray<unsigned int> hostData;
	hostData.resize(count);
	for (int i=0;i<count;i++)
	{
		hostData[i] = h_keys[i];
	}

	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);

	b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);

	// first, untimed run
	gpuData.copyFromHost(hostData);
	sorter.execute(gpuData);
	clFinish(g_cqCommandQueue);

	{
		// Perform the timed number of sorting iterations
		double elapsed = 0;
		b3Clock watch;

		// warm-start; wait for it so it cannot overlap the first timed iteration
		gpuData.copyFromHost(hostData);
		clFinish(g_cqCommandQueue);
		sorter.execute(gpuData);
		clFinish(g_cqCommandQueue);

		watch.reset();

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			gpuData.copyFromHost(hostData);
			clFinish(g_cqCommandQueue);

			// Start GPU timing record
			double startMs = watch.getTimeMicroseconds()/1e3;

			// Call the sorting API routine
			sorter.execute(gpuData);
			clFinish(g_cqCommandQueue);

			// End GPU timing record
			double stopMs = watch.getTimeMicroseconds()/1e3;
			double duration = stopMs - startMs;

			elapsed += duration;
			printf("duration = %f\n", duration);
		}

		// Display timing information (guarding the zero-iteration / zero-time cases)
		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
		// avg_runtime is in milliseconds, so elements/ms/1000 is 10^6 elements per second
		double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// hand the sorted keys back to the caller
		gpuData.copyToHost(hostData);
		for (int i=0;i<count;i++)
		{
			h_keys[i] = hostData[i];
		}
	}
}
/**
 * Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * Keys and values are packed into b3SortData pairs for the device sort and the
 * sorted pairs are written back into h_keys / h_values.
 *
 * @param[in] 		num_elements
 * 		Size in elements of the vector to sort
 * @param[in,out] 	h_keys
 * 		Vector of keys to sort; overwritten with the sorted keys
 * @param[in,out] 	h_values
 * 		Vector of values to sort; overwritten in the order of the sorted keys
 * @param[in] 		iterations
 * 		Number of times to invoke the GPU sorting primitive
 */
template <typename K, typename V>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	V *h_values,
	unsigned int iterations)
{
	printf("Key-values, %u iterations, %u elements\n", iterations, num_elements);

	const int count = (int) num_elements;

	b3AlignedObjectArray<b3SortData> hostData;
	hostData.resize(count);
	for (int i=0;i<count;i++)
	{
		hostData[i].m_key = h_keys[i];
		hostData[i].m_value = h_values[i];
	}

	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);

	b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);

	// first, untimed run
	gpuData.copyFromHost(hostData);
	sorter.execute(gpuData);
	clFinish(g_cqCommandQueue);

	{
		// Perform the timed number of sorting iterations
		double elapsed = 0;
		b3Clock watch;

		// warm-start
		gpuData.copyFromHost(hostData);
		sorter.execute(gpuData);
		clFinish(g_cqCommandQueue);

		watch.reset();

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			gpuData.copyFromHost(hostData);
			clFinish(g_cqCommandQueue);

			// Start GPU timing record
			double startMs = watch.getTimeMicroseconds()/1e3;

			// Call the sorting API routine
			sorter.execute(gpuData);
			clFinish(g_cqCommandQueue);

			// End GPU timing record
			double stopMs = watch.getTimeMicroseconds()/1e3;
			double duration = stopMs - startMs;

			elapsed += duration;
			printf("duration = %f\n", duration);
		}

		// Display timing information (guarding the zero-iteration / zero-time cases)
		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
		// avg_runtime is in milliseconds, so elements/ms/1000 is 10^6 elements per second
		double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// hand the sorted pairs back to the caller
		gpuData.copyToHost(hostData);
		for (int i=0;i<count;i++)
		{
			h_keys[i] = hostData[i].m_key;
			h_values[i] = hostData[i].m_value;
		}
	}
}
/**
 * Generates random keys of type K, one byte at a time.
 *
 * Each byte is taken from bits 7 and up of a rand() sample, because the
 * higher-order bits returned by rand() are commonly considered more uniformly
 * distributed than the lower-order bits.
 *
 * We can decrease the entropy level of keys by adopting the technique
 * of Thearling and Smith in which keys are computed from the bitwise AND of
 * multiple random samples:
 *
 * entropy_reduction	| Effectively-unique bits per key
 * -----------------------------------------------------
 * -1					| 0
 * 0					| 32
 * 1					| 25.95
 * 2					| 17.41
 * 3					| 10.78
 * 4					| 6.42
 * ...					| ...
 *
 * @param[out] key                generated key
 * @param[in]  entropy_reduction  number of extra samples ANDed into every byte;
 *                                -1 yields all-one bytes (for floating point K
 *                                that bit pattern is a NaN and is rejected, so
 *                                the loop below would never terminate)
 * @param[in]  lower_key_bits     when in [0, 8*sizeof(K)), only this many
 *                                low-order bits of the key are kept; the mask is
 *                                applied to the key's bytes read as a native
 *                                integer, so "low-order" assumes a little-endian
 *                                host
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
	unsigned char key_bits[NUM_UCHARS];

	do {

		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
			unsigned char quarterword = 0xff;
			for (int i = 0; i <= entropy_reduction; i++) {
				quarterword &= (rand() >> 7);
			}
			key_bits[j] = quarterword;
		}

		// The mask must be built in 64 bits: '1 << n' is an int shift and is
		// undefined for n >= 31, which silently broke wide keys. Keys wider than
		// 'base' cannot be masked this way and are left untouched.
		if (lower_key_bits >= 0 &&
			(size_t) lower_key_bits < sizeof(K) * 8 &&
			sizeof(K) <= sizeof(unsigned long long)) {
			unsigned long long base = 0;
			memcpy(&base, key_bits, sizeof(K));
			base &= (1ULL << lower_key_bits) - 1;
			memcpy(key_bits, &base, sizeof(K));
		}

		memcpy(&key, key_bits, sizeof(K));

	} while (key != key);		// avoids NaNs when generating random floating point numbers
}
/******************************************************************************
* Templated routines for printing keys/values to the console
******************************************************************************/
/**
 * Prints a single value to stdout without a trailing separator or newline.
 *
 * Generic fallback: formats val with "%d", so it is only meaningful for types
 * that are passed to printf as int. Every other supported type has an explicit
 * specialization below.
 */
template<typename T>
void PrintValue(T val) {
	printf("%d", val);
}

template<>
void PrintValue<float>(float val) {
	printf("%f", val);
}

template<>
void PrintValue<double>(double val) {
	printf("%f", val);
}

// unsigned char / unsigned short are promoted to int when passed through "...";
// cast explicitly so the argument type matches the "%u" conversion.
template<>
void PrintValue<unsigned char>(unsigned char val) {
	printf("%u", (unsigned int) val);
}

template<>
void PrintValue<unsigned short>(unsigned short val) {
	printf("%u", (unsigned int) val);
}

template<>
void PrintValue<unsigned int>(unsigned int val) {
	printf("%u", val);
}

template<>
void PrintValue<long>(long val) {
	printf("%ld", val);
}

template<>
void PrintValue<unsigned long>(unsigned long val) {
	printf("%lu", val);
}

template<>
void PrintValue<long long>(long long val) {
	printf("%lld", val);
}

template<>
void PrintValue<unsigned long long>(unsigned long long val) {
	printf("%llu", val);
}
/**
 * Compares two arrays element by element and reports the outcome on stdout.
 *
 * @param computed   array under test
 * @param reference  expected contents
 * @param len        number of elements to compare
 * @param verbose    when true, the neighbourhood of the first mismatch is
 *                   printed for both arrays
 * @return 0 when all len elements match ("CORRECT" is printed), 1 at the first
 *         mismatch ("INCORRECT" plus the offending pair is printed)
 */
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
	printf("\n");

	// locate the first differing element, if any
	SizeT i = 0;
	while (i < len && !(computed[i] != reference[i])) {
		i++;
	}

	if (!(i < len)) {
		printf("CORRECT\n");
		return 0;
	}

	printf("INCORRECT: [%lu]: ", (unsigned long) i);
	PrintValue<T>(computed[i]);
	printf(" != ");
	PrintValue<T>(reference[i]);

	if (verbose) {
		// same window (up to five elements either side) for both arrays
		T* windows[2] = { computed, reference };
		const char* headers[2] = { "\nresult[...", "\nreference[..." };

		for (int w = 0; w < 2; w++) {
			printf("%s", headers[w]);
			size_t j = (i >= 5) ? i - 5 : 0;
			while ((j < i + 5) && (j < len)) {
				PrintValue<T>(windows[w][j]);
				printf(", ");
				j++;
			}
			printf("...]");
		}
	}

	return 1;
}
/**
 * Creates an example sorting problem whose keys is a vector of the specified
 * number of K elements, values of V elements, and then dispatches the problem
 * to the GPU for the given number of iterations, displaying runtime information.
 * The GPU result is verified against std::sort of the same keys.
 *
 * @param[in] 		iterations
 * 		Number of times to invoke the GPU sorting primitive
 * @param[in] 		num_elements
 * 		Size in elements of the vector to sort
 * @param[in] 		keys_only
 * 		When true only keys are sorted; otherwise each key carries a value
 */
template<typename K, typename V>
void TestSort(
	unsigned int iterations,
	int num_elements,
	bool keys_only)
{
	// Allocate the sorting problem on the host and fill the keys with random bytes

	K *h_keys = (K*) malloc(num_elements * sizeof(K));
	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
	V *h_values = NULL;
	if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));

	if (!h_keys || !h_reference_keys || (!keys_only && !h_values)) {
		printf("error: could not allocate host memory for %d elements\n", num_elements);
		free(h_keys);
		free(h_reference_keys);
		free(h_values);
		return;
	}

	// Use random bits
	for (int i = 0; i < num_elements; ++i) {
		RandomBits<K>(h_keys[i], 0);
		if (!keys_only)
			h_values[i] = h_keys[i];
		h_reference_keys[i] = h_keys[i];
	}

	// Run the timing test
	if (keys_only) {
		TimedSort<K>(num_elements, h_keys, iterations);
	} else {
		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
	}

	// Display sorted key data
	if (g_verbose) {
		printf("\n\nKeys:\n");
		for (int i = 0; i < num_elements; i++) {
			PrintValue<K>(h_keys[i]);
			printf(", ");
		}
		printf("\n\n");
	}

	// Verify solution
	std::sort(h_reference_keys, h_reference_keys + num_elements);
	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
	printf("\n");
	fflush(stdout);

	// Free our allocated host memory (the reference keys used to be leaked)
	free(h_keys);
	free(h_reference_keys);
	free(h_values);
}
/**
 * Displays the commandline usage for this tool.
 *
 * The option list and the defaults printed here mirror what main() actually
 * parses and initialises; keep the two in sync.
 */
void Usage()
{
	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
	printf("\n");
	printf("\t--help\tDisplays this message and exits.\n");
	printf("\n");
	printf("\t--v\tDisplays sorted results to the console.\n");
	printf("\n");
	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
	printf("\n");
	printf("\t--n\tThe number of elements to comprise the sample problem\n");
	printf("\t\t\tDefault = 8388608 (8*1024*1024)\n");
	printf("\n");
	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
	printf("\n");
	printf("\t--deviceId\tPreferred OpenCL device index. Default = -1 (no preference)\n");
	printf("\n");
	printf("\t--platformId\tPreferred OpenCL platform index. Default = -1 (no preference)\n");
	printf("\n");
}
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
/**
 * Minimal "--key=value" / "--flag" command line parser.
 *
 * Only arguments that start with "--" are recorded; everything else is
 * ignored. A flag without '=' is stored with an empty value. When a key is
 * given more than once the last occurrence wins.
 */
class b3CommandLineArgs
{
protected:

	// key (without the leading "--") -> value text ("" for bare flags)
	std::map<std::string, std::string> pairs;

public:

	/// Parses argv[1..argc-1]; argv[0] (the program name) is skipped.
	b3CommandLineArgs(int argc, char **argv)
	{
		using namespace std;

		for (int i = 1; i < argc; i++)
		{
			if (!argv[i]) {
				continue;
			}

			string arg = argv[i];

			// the length check keeps arg[1] in range for "" and one-character arguments
			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) {
				continue;
			}

			string key, val;
			string::size_type pos = arg.find('=');
			if (pos == string::npos) {
				key = arg.substr(2);
				val = "";
			} else {
				key = arg.substr(2, pos - 2);
				val = arg.substr(pos + 1);
			}
			pairs[key] = val;
		}
	}

	/// Returns true when "--arg_name" or "--arg_name=..." was present.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	/**
	 * Reads the value of "--arg_name=<value>" into val.
	 * val is left untouched when the argument is absent or its text cannot be
	 * parsed as a T (for example a bare flag with no value).
	 */
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	/// Number of distinct "--" arguments that were recorded.
	int ParsedArgc()
	{
		return (int) pairs.size();
	}
};

template <typename T>
void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
	using namespace std;
	map<string, string>::iterator itr = pairs.find(arg_name);
	if (itr == pairs.end()) {
		return;
	}

	// Extract into a temporary: a failed extraction may overwrite its target
	// (C++11 stores zero), which would silently discard the caller's default.
	istringstream strstream(itr->second);
	T parsed = val;
	if (strstream >> parsed) {
		val = parsed;
	}
}

/**
 * String specialization: val receives a malloc'ed copy of the value which the
 * caller must free(), or NULL when the argument is absent (or allocation fails).
 */
template <>
void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
	using namespace std;
	val = NULL;

	map<string, string>::iterator itr = pairs.find(arg_name);
	if (itr == pairs.end()) {
		return;
	}

	const string& s = itr->second;
	char* copy = (char*) malloc(sizeof(char) * (s.length() + 1));
	if (copy) {
		strcpy(copy, s.c_str());
		val = copy;
	}
}
/******************************************************************************
* Main
******************************************************************************/
extern bool gDebugSkipLoadingBinary;
int main( int argc, char** argv)
{
//gDebugSkipLoadingBinary = true;
cl_int ciErrNum;
b3CommandLineArgs args(argc,argv);
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
cl_platform_id platformId;
g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
if (!numDev)
{
printf("error: no OpenCL devices\n");
exit(0);
}
int result;
int devId = 0;
g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
b3OpenCLUtils_printDeviceInfo(g_device);
// create a command-queue
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//srand(time(NULL));
srand(0); // presently deterministic
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only = true;
//
// Check command line arguments
//
if (args.CheckCmdLineFlag("help"))
{
Usage();
return 0;
}
args.GetCmdLineArgument("i", iterations);
args.GetCmdLineArgument("n", num_elements);
keys_only = !args.CheckCmdLineFlag("key-values");
g_verbose = args.CheckCmdLineFlag("v");
TestSort<unsigned int, unsigned int>(
iterations,
num_elements,
keys_only);
}

View File

@ -1,35 +0,0 @@
#ifndef B3_INT2_H
#define B3_INT2_H
/// Pair of unsigned ints, addressable either by name (x, y) or by index (s[0], s[1]).
struct b3UnsignedInt2
{
	union
	{
		// named view
		struct
		{
			unsigned int x;
			unsigned int y;
		};
		// indexed view over the same storage
		unsigned int s[2];
	};
};
/// Pair of ints, addressable either by name (x, y) or by index (s[0], s[1]).
struct b3Int2
{
	union
	{
		// named view
		struct
		{
			int x;
			int y;
		};
		// indexed view over the same storage
		int s[2];
	};
};
#endif

View File

@ -1,379 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "../basic_initialize/b3OpenCLUtils.h"
#include "../host/b3FillCL.h"
#include "../host/b3BoundSearchCL.h"
#include "../host/b3RadixSort32CL.h"
#include "../host/b3PrefixScanCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include "Bullet3Common/b3MinMax.h"
// Running totals over all tests, updated by TEST_REPORT.
int g_nPassed = 0;
int g_nFailed = 0;
// Failure flag of the test currently running; reset by TEST_INIT, set by TEST_ASSERT.
bool g_testFailed = 0;

// Each statement-like macro is wrapped in do { } while (0) so that it behaves as
// exactly one statement, e.g. as the body of an unbraced if/else or for loop.
// All of them must be followed by a semicolon.
#define TEST_INIT do { g_testFailed = 0; } while (0)
#define TEST_ASSERT(x) do { if( !(x) ){ g_testFailed = 1; } } while (0)
#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while (0)

// Rounds num up to the next multiple of alignment (num itself when already aligned).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
// OpenCL state shared by every test below: created in initCL(), released in exitCL().
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
// Name of the selected device; assigned in initCL() and not read anywhere in this file.
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
b3OpenCLDeviceInfo info;
g_device= b3OpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3OpenCLUtils::printDeviceInfo(g_device);
b3OpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
/**
 * Releases the command queue and context created by initCL().
 * Handles that were never created are skipped, and the globals are cleared so
 * no stale handle can be used or released twice.
 */
void exitCL()
{
	if (g_queue)
	{
		clReleaseCommandQueue(g_queue);
		g_queue = 0;
	}
	if (g_context)
	{
		clReleaseContext(g_context);
		g_context = 0;
	}
}
// Number of problem sizes each primitive test below iterates over.
#define NUM_TESTS 7

/**
 * Checks b3FillCL: for NUM_TESTS growing sizes, fills the start of a device
 * buffer with a constant and compares it against the host implementation
 * (executeHost) of the same fill.
 */
inline void fillIntTest()
{
	TEST_INIT;

	b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
	int maxSize=1024*256;
	b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
	intBuffer.resize(maxSize);

	int dx = maxSize/NUM_TESTS;
	for (int iter=0;iter<NUM_TESTS;iter++)
	{
		int size = b3Min( 11+dx*iter, maxSize );

		int value = 2;
		int offset=0;

		// device fill
		fillCL->execute(intBuffer,value,size,offset);

		// host reference
		b3AlignedObjectArray<int> hostBuf2;
		hostBuf2.resize(size);
		fillCL->executeHost(hostBuf2,value,size,offset);

		b3AlignedObjectArray<int> hostBuf;
		intBuffer.copyToHost(hostBuf);

		// hostBuf holds the whole device buffer; only the first 'size' entries were filled
		for(int i=0; i<size; i++)
		{
			TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
		}
	}

	delete fillCL;

	TEST_REPORT( "fillIntTest" );
}
/// Re-seeds the C library generator used by getRandom().
__inline
void seedRandom(int seed)
{
	srand( (unsigned int) seed );
}
/**
 * Draws a pseudo-random value of type T from rand().
 *
 * A fraction in [0, 1) with a resolution of 1/10000 is scaled by (maxV - minV)
 * and added to minV; the sum is computed in floating point and converted to T.
 *
 * @param minV  lower bound of the range
 * @param maxV  upper bound of the range
 */
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	const float fraction = (rand()%10000)/10000.f;
	const T span = maxV - minV;
	return (T)(minV + fraction*span);
}
/// Strict ordering for b3SortData: by key first, ties broken by value.
struct b3SortDataCompare
{
	inline bool operator()(const b3SortData& first, const b3SortData& second) const
	{
		if (first.m_key < second.m_key)
		{
			return true;
		}
		if (first.m_key == second.m_key)
		{
			return first.m_value < second.m_value;
		}
		return false;
	}
};
void boundSearchTest( )
{
TEST_INIT;
int maxSize = 1024*256;
int bucketSize = 256;
b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<unsigned int> upperHost;
b3AlignedObjectArray<unsigned int> lowerHost;
b3AlignedObjectArray<unsigned int> upperHostCompare;
b3AlignedObjectArray<unsigned int> lowerHostCompare;
b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize );
upperHost.resize(bucketSize);
lowerHost.resize(bucketSize);
upperHostCompare.resize(bucketSize);
lowerHostCompare.resize(bucketSize);
srcHost.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
// v.m_key = i<2? 0 : 5;
v.m_key = getRandom(0,bucketSize);
v.m_value = i;
srcHost.at(i) = v;
}
srcHost.quickSort(b3SortDataCompare());
srcCL.copyFromHost(srcHost);
{
for(int i=0; i<bucketSize; i++)
{
lowerHost[i] = -1;
lowerHostCompare[i] = -1;
upperHost[i] = -1;
upperHostCompare[i] = -1;
}
upperCL.copyFromHost(upperHost);
lowerCL.copyFromHost(lowerHost);
}
search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
lowerCL.copyToHost(lowerHost);
upperCL.copyToHost(upperHost);
for(int i=0; i<bucketSize; i++)
{
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
}
/*
for(int i=1; i<bucketSize; i++)
{
int lhi_1 = lowerHost[i-1];
int lhi = lowerHost[i];
for(int j=lhi_1; j<lhi; j++)
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key < i );
}
}
for(int i=0; i<bucketSize; i++)
{
int jMin = (i==0)?0:upperHost[i-1];
for(int j=jMin; j<upperHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key <= i );
}
}
*/
for(int i=0; i<bucketSize; i++)
{
int lhi = lowerHost[i];
int uhi = upperHost[i];
for(int j=lhi; j<uhi; j++)
{
if ( srcHost[j].m_key != i )
{
printf("error %d != %d\n",srcHost[j].m_key,i);
}
TEST_ASSERT( srcHost[j].m_key == i );
}
}
}
delete search;
TEST_REPORT( "boundSearchTest" );
}
/**
 * Checks b3PrefixScanCL against its host implementation: for NUM_TESTS growing
 * sizes an all-ones input is scanned on the host and on the device, and both
 * the returned sums and every output element must match.
 */
void prefixScanTest()
{
	TEST_INIT;

	int maxSize = 1024*256;

	b3AlignedObjectArray<unsigned int> buf0Host;
	b3AlignedObjectArray<unsigned int> buf1Host;

	b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
	b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);

	b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);

	const int step = maxSize/NUM_TESTS;
	for (int pass = 0; pass < NUM_TESTS; ++pass)
	{
		int size = b3Min( 128+step*pass, maxSize );

		buf0Host.resize(size);
		buf1Host.resize(size);

		// input: all ones
		for (int n = 0; n < size; ++n)
		{
			buf0Host[n] = 1;
		}

		buf2CL.copyFromHost( buf0Host);

		unsigned int sumHost, sumGPU;

		// buf1Host receives the host scan, buf3CL the device scan
		scan->executeHost(buf0Host, buf1Host, size, &sumHost );
		scan->execute( buf2CL, buf3CL, size, &sumGPU );

		// buf0Host is reused to hold the device result
		buf3CL.copyToHost(buf0Host);

		TEST_ASSERT( sumHost == sumGPU );
		for (int n = 0; n < size; ++n)
		{
			TEST_ASSERT( buf1Host[n] == buf0Host[n] );
		}
	}

	delete scan;

	TEST_REPORT( "scanTest" );
}
bool radixSortTest()
{
TEST_INIT;
int maxSize = 1024*256;
b3AlignedObjectArray<b3SortData> buf0Host;
buf0Host.resize(maxSize);
b3AlignedObjectArray<b3SortData> buf1Host;
buf1Host.resize(maxSize );
b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );//not necessary
buf0Host.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
v.m_key = getRandom(0,0xff);
v.m_value = i;
buf0Host[i] = v;
}
buf2CL.copyFromHost( buf0Host);
sort->executeHost( buf0Host);
sort->execute(buf2CL);
buf2CL.copyToHost(buf1Host);
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
}
}
delete sort;
TEST_REPORT( "radixSort" );
return g_testFailed;
}
/// Test driver: initializes OpenCL on the device/platform selected by the optional
/// "deviceId" / "platformId" command line arguments (-1 is passed to initCL when an
/// argument is absent), runs all primitive tests and prints the pass/fail totals.
/// Returns 0 when no test failed and 1 otherwise, so scripts can detect failures.
int main(int argc, char** argv)
{
	int preferredDeviceIndex = -1;
	int preferredPlatformIndex = -1;

	b3CommandLineArgs args(argc, argv);
	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
	args.GetCmdLineArgument("platformId", preferredPlatformIndex);

	initCL(preferredDeviceIndex,preferredPlatformIndex);

	fillIntTest();
	boundSearchTest();
	prefixScanTest();
	radixSortTest();

	exitCL();

	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
	printf("End, press <enter>\n");
	getchar();

	// previously the process always exited with 0, hiding failures from callers
	return (g_nFailed != 0) ? 1 : 0;
}

View File

@ -1,41 +0,0 @@
-- Premake script for the OpenCL parallel-primitives test executable.
-- createProject defines one console project per OpenCL vendor; nothing is
-- generated for a vendor when findOpenCL(vendor) returns a false value.
function createProject(vendor)
	-- 'local' keeps this flag out of the global environment shared by all premake scripts
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_primitives_test_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".","..","../../../src"}

		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3FillCL.cpp",
			"../host/b3FillCL.h",
			"../host/b3BoundSearchCL.cpp",
			"../host/b3BoundSearchCL.h",
			"../host/b3PrefixScanCL.cpp",
			"../host/b3PrefixScanCL.h",
			"../host/b3RadixSort32CL.cpp",
			"../host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@ -1,116 +0,0 @@
///original author: Erwin Coumans
#include "b3OpenCLUtils.h"
#include "../parallel_primitives/host/b3OpenCLArray.h"
#include "../parallel_primitives/host/b3LauncherCL.h"
#include <stdio.h>
#define MSTRINGIFY(A) #A
// OpenCL source of the ReduceGlobal kernel, stringified at compile time.
// Each work-group sums its slice of d_in in place (halving the stride every step)
// and work-item 0 of the group writes the group's total to d_out[group id].
// d_in is modified by the kernel. Elements at index >= numElements are never read,
// so numElements does not have to be a multiple of the work-group size.
// NOTE: the kernel text must not contain commas outside of parentheses, because
// MSTRINGIFY takes a single macro argument.
const char* kernelString= MSTRINGIFY(
__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
{
	int myId = get_global_id(0);
	int tid = get_local_id(0);
	int ls = get_local_size(0);
	for (unsigned int s=ls/2;s>0;s>>=1)
	{
		if (myId<numElements)
		{
			if (tid<s)
			{
				int partner = myId + (int)s;
				if (partner<numElements)
				{
					d_in[myId] += d_in[partner];
				}
			}
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	if (tid==0)
	{
		if (myId<numElements)
		{
			d_out[get_group_id(0)]=d_in[myId];
		}
	}
}
);
int main(int argc, char* argv[])
{
int ciErrNum = 0;
int preferred_device = -1;
int preferred_platform = -1;
cl_platform_id platformId;
cl_context ctx;
cl_command_queue queue;
cl_device_id device;
cl_kernel addKernel;
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
b3OpenCLUtils::printPlatformInfo(platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (!ctx) {
printf("No OpenCL capable GPU found!");
return 0;
}
device = b3OpenCLUtils::getDevice(ctx,0);
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numElements = 1024*1024;
b3OpenCLArray<int> a(ctx,queue);
b3OpenCLArray<int> b(ctx,queue);
b3AlignedObjectArray<int> hostA;
b3AlignedObjectArray<int> hostB;
for (int i=0;i<numElements;i++)
{
hostA.push_back(1);
hostB.push_back(0.f);
}
a.copyFromHost(hostA);
b.copyFromHost(hostB);
int hostSum= 0;
for (int i=0;i<numElements;i++)
{
hostSum += hostA.at(i);
}
b.resize(numElements);
{
b3LauncherCL launcher( queue, addKernel);
launcher.setBuffer( a.getBufferCL());
launcher.setBuffer( b.getBufferCL());
launcher.setConst( numElements );
launcher.launch1D( numElements,1024);
}
clFinish(queue);
{
b3LauncherCL launcher( queue, addKernel);
launcher.setBuffer( b.getBufferCL());
launcher.setBuffer( a.getBufferCL());
launcher.setConst( 1024 );
launcher.launch1D( 1024,1024);
}
clFinish(queue);
printf("hostSum = %d\n", hostSum);
int clSum = a.at(0);
printf("clSum = %d\n", clSum );
if (hostSum != clSum)
{
printf("Incorrect result\n");
} else
{
printf("Correct result\n");
}
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
printf("press key\n");
getchar();
return 0;
}

View File

@ -1,41 +0,0 @@
-- Premake script for the OpenCL reduction sample.
-- Defines one console project per OpenCL vendor whose SDK is found by findOpenCL.
function createProject (vendor)
	local hasCL = findOpenCL(vendor)
	if not hasCL then
		-- no OpenCL SDK for this vendor: do not generate a project
		return
	end

	project ( "OpenCL_reduce_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize",
		"../../src"
	}

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
	}
end

for _, vendor in ipairs({"AMD", "NVIDIA", "Intel", "Apple"}) do
	createProject(vendor)
end

View File

@ -1,16 +0,0 @@
// Element-wise sum c = a + b over arrays of float8, one float8 per work item.
// numElements is the number of float8 entries; work items whose global id lies
// at or beyond it do nothing.
__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)
{
	// index of this work item's float8 in the global arrays
	const int gid = get_global_id(0);

	if (gid < numElements)
	{
		// load both operands, add them and store the result to global memory
		c[gid] = a[gid] + b[gid];
	}
}

View File

@ -1,20 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// vectorAddCL holds the OpenCL source of the VectorAdd kernel as a C string.
// Do not edit the string by hand: change the kernel source and regenerate this file.
static const char* vectorAddCL= \
"\n"
"\n"
"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
"{\n"
" // get oct-float index into global data array\n"
" int iGID = get_global_id(0);\n"
" if (iGID>=numElements)\n"
" return;\n"
"\n"
" float8 aGID = a[iGID];\n"
" float8 bGID = b[iGID];\n"
"\n"
" float8 result = aGID + bGID;\n"
" // write back out to GMEM\n"
" c[iGID] = result;\n"
"}\n"
"\n"
;

View File

@ -1,408 +0,0 @@
///VectorAdd sample, from the NVidia JumpStart Guide
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
///This version includes the native OpenCL headers (<OpenCL/OpenCL.h> on Apple, <CL/cl.h> elsewhere) instead of <MiniCL/cl.h>
///Apart from this include, all other code should compile and work on any OpenCL-compliant implementation
#define LOAD_FROM_FILE
#ifdef __APPLE__
#include <OpenCL/OpenCL.h>
#else
#include <CL/cl.h>
#endif //__APPLE__
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); b3Assert((a) == (b)); }
size_t wgSize;
#include "VectorAddKernels.h"
// Vendor string of the OpenCL platform this sample prefers, chosen at compile time
// through the CL_PLATFORM_INTEL / CL_PLATFORM_AMD / CL_PLATFORM_NVIDIA defines.
// main() compares it against each platform's CL_PLATFORM_VENDOR string.
#ifdef CL_PLATFORM_INTEL
const char* preferredPlatform = "Intel(R) Corporation";
#elif defined CL_PLATFORM_AMD
const char* preferredPlatform = "Advanced Micro Devices, Inc.";
#elif defined CL_PLATFORM_NVIDIA
const char* preferredPlatform = "NVIDIA Corporation";
#else
// no vendor define given: "Unknown" is not expected to match any platform
const char* preferredPlatform = "Unknown";
#endif
/// Reads a whole file into a newly allocated, NUL-terminated string, prefixed by cPreamble.
///
/// cFilename     path of the file to read (opened in binary mode)
/// cPreamble     text copied in front of the file contents; NULL is treated as ""
/// szFinalLength if not NULL, receives the length of preamble + file contents
///               (excluding the terminating NUL)
///
/// Returns the malloc'ed string, which the caller must release with free(), or NULL
/// when the file cannot be opened, sized or fully read, or when memory runs out.
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	if (cFilename == NULL)
	{
		return NULL;
	}
	// a missing preamble means "no preamble" instead of crashing in strlen
	if (cPreamble == NULL)
	{
		cPreamble = "";
	}

	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	size_t szPreambleLength = strlen(cPreamble);

	// get the length of the source code; ftell reports failure with a negative value
	long fileLength = -1;
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		fileLength = ftell(pFileStream);
	}
	if (fileLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
	{
		fclose(pFileStream);
		return NULL;
	}
	size_t szSourceLength = (size_t)fileLength;

	// allocate a buffer for the source code string and read it in
	char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
	if (cSourceString == NULL)
	{
		fclose(pFileStream);
		return NULL;
	}
	memcpy(cSourceString, cPreamble, szPreambleLength);
	size_t szBytesRead = fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream);

	// close the file; a short read would leave uninitialized bytes in the string
	fclose(pFileStream);
	if (szBytesRead != szSourceLength)
	{
		free(cSourceString);
		return NULL;
	}

	// report the total length of the combined (preamble + source) string
	if (szFinalLength != NULL)
	{
		*szFinalLength = szSourceLength + szPreambleLength;
	}
	cSourceString[szSourceLength + szPreambleLength] = '\0';
	return cSourceString;
}
// Maximum work-item sizes per dimension of the device last passed to printDevInfo
size_t workitem_size[3];

/// Prints the name, type flags, number of compute units and maximum work-item sizes
/// of an OpenCL device to stdout. Also stores the work-item sizes in workitem_size.
/// All query results are zero-initialized first, so a failed clGetDeviceInfo call
/// prints zeros / an empty name instead of uninitialized memory.
void printDevInfo(cl_device_id device)
{
	char device_string[1024] = {0};
	clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
	printf( " Device %s:\n", device_string);

	// CL_DEVICE_TYPE (a bit field, so more than one line may be printed)
	cl_device_type type = 0;
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
	if( type & CL_DEVICE_TYPE_CPU )
	{
		printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	}
	if( type & CL_DEVICE_TYPE_GPU )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	}
	if( type & CL_DEVICE_TYPE_ACCELERATOR )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	}
	if( type & CL_DEVICE_TYPE_DEFAULT )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
	}

	// CL_DEVICE_MAX_COMPUTE_UNITS (cl_uint, hence %u)
	cl_uint compute_units = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);

	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	workitem_size[0] = workitem_size[1] = workitem_size[2] = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
	// size_t does not match %u on 64-bit targets, so convert explicitly
	printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
}
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
size_t szParmDataBytes; // Byte size of context information
cl_int ciErr1, ciErr2; // Error code var
int iTestN = 100000 * 8; // Size of Vectors to process
int actualGlobalSize = iTestN / 8;
// set Global and Local work size dimensions
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
szLocalWorkSize[0]= iTestN>>3;
// Allocate and initialize host arrays
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
dst = (void *)malloc (sizeof(cl_float) * iTestN);
int i;
// Initialize arrays with some values
for (i=0;i<iTestN;i++)
{
((cl_float*)srcA)[i] = cl_float(i);
((cl_float*)srcB)[i] = 2;
((cl_float*)dst)[i]=-1;
}
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (0 < numPlatforms)
{
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
for (unsigned i = 0; i < numPlatforms; ++i)
{
char pbuf[100];
status = clGetPlatformInfo(platforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
platform = platforms[i];
if (!strcmp(pbuf, preferredPlatform))
{
printf("Found platform %s\n", preferredPlatform);
break;
}
}
delete[] platforms;
}
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platform,
0
};
// Create OpenCL context & context
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
// Query all devices available to the context
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*)malloc(szParmDataBytes);
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
if (cdDevices)
{
printDevInfo(cdDevices[0]);
}
// Create a command queue for first device the context reported
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
ciErr1 |= ciErr2;
// Allocate the OpenCL source and result buffer memory objects on the device GMEM
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
ciErr1 |= ciErr2;
///create kernels from binary
int numDevices = 1;
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
for (i = 0; i < numDevices; ++i) {
images[i] = 0;
lengths[i] = 0;
}
// Read the OpenCL kernel in from source file
const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
const char* cPathAndName = cSourceFile;
#ifdef LOAD_FROM_FILE
size_t szKernelLength;
const char* cSourceCL =0;
char relativeFileName[1024];
{
const char* prefix[]={"../","../../","../../../","../../../../"};
int numPrefixes = sizeof(prefix)/sizeof(char*);
for (int i=0;!cSourceCL && i<numPrefixes;i++)
{
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
if (cSourceCL)
{
printf("Loaded program source: %s\n", relativeFileName);
}
}
}
if (!cSourceCL)
{
printf("Couldn't find file %s, exiting\n",cSourceFile);
exit(0);
}
#else
const char* cSourceCL = vectorAddCL;
size_t szKernelLength = strlen(cSourceCL);
#endif //LOAD_FROM_FILE
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
printf("clCreateProgramWithSource...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-mad-enable -DMAC ";
#else
char flags[1024]={0};
#ifdef CL_PLATFORM_INTEL
sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
#endif//CL_PLATFORM_INTEL
#endif//MAC
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
printf("clBuildProgram...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Create the kernel
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
printf("clCreateKernel (VectorAdd)...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
cl_int ciErrNum;
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
if (ciErrNum != CL_SUCCESS)
{
printf("cannot get workgroup size\n");
exit(0);
}
// Set the Argument values
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
printf("Press ENTER to quit\n");
getchar();
int workgroupSize = wgSize;
if(workgroupSize <= 0)
{ // let OpenCL library calculate workgroup size
size_t globalWorkSize[2];
globalWorkSize[0] = actualGlobalSize;
globalWorkSize[1] = 1;
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
}
else
{
size_t localWorkSize[2], globalWorkSize[2];
//workgroupSize = b3Min(workgroupSize, actualGlobalSize);
int num_t = actualGlobalSize / workgroupSize;
int num_g = num_t * workgroupSize;
if(num_g < actualGlobalSize)
{
num_t++;
//this can cause problems -> processing outside of the buffer
//make sure to check kernel
}
size_t globalThreads[] = {num_t * workgroupSize};
size_t localThreads[] = {workgroupSize};
localWorkSize[0] = workgroupSize;
globalWorkSize[0] = num_t * workgroupSize;
localWorkSize[1] = 1;
globalWorkSize[1] = 1;
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
}
if (ciErrNum != CL_SUCCESS)
{
printf("cannot clEnqueueNDRangeKernel\n");
exit(0);
}
clFinish(cqCommandQue);
// Read back results and check accumulated errors
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
// Release kernel, program, and memory objects
// NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
free(cdDevices);
clReleaseKernel(ckKernel);
clReleaseProgram(cpProgram);
clReleaseCommandQueue(cqCommandQue);
clReleaseContext(cxGPUContext);
// print the results
int iErrorCount = 0;
for (i = 0; i < iTestN; i++)
{
if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
iErrorCount++;
}
if (iErrorCount)
{
printf("Validation FAILED\n");
} else
{
printf("Validation SUCCESSFULL\n");
}
// Free host memory, close log and return success
for (i = 0; i < 3; i++)
{
clReleaseMemObject(cmMemObjs[i]);
}
free(srcA);
free(srcB);
free (dst);
printf("Press ENTER to quit\n");
getchar();
}

View File

@ -1,69 +0,0 @@
///original author: Erwin Coumans
#include "b3OpenCLUtils.h"
#include "../parallel_primitives/host/b3OpenCLArray.h"
#include "../parallel_primitives/host/b3LauncherCL.h"
#include <stdio.h>
#define MSTRINGIFY(A) #A
// OpenCL source of the VectorAdd kernel, stringified at compile time by MSTRINGIFY.
// The kernel computes c[i] = a[i] + b[i] for every work item i < numElements;
// work items at or beyond numElements return without touching memory.
const char* kernelString= MSTRINGIFY(
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
{
int iGID = get_global_id(0);
if (iGID>=numElements)
return;
float aGID = a[iGID];
float bGID = b[iGID];
float result = aGID + bGID;
c[iGID] = result;
}
);
int main(int argc, char* argv[])
{
int ciErrNum = 0;
int preferred_device = -1;
int preferred_platform = -1;
cl_platform_id platformId;
cl_context ctx;
cl_command_queue queue;
cl_device_id device;
cl_kernel addKernel;
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
b3OpenCLUtils::printPlatformInfo(platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (!ctx) {
printf("No OpenCL capable GPU found!");
return 0;
}
device = b3OpenCLUtils::getDevice(ctx,0);
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numElements = 32;
b3OpenCLArray<float> a(ctx,queue);
b3OpenCLArray<float> b(ctx,queue);
b3OpenCLArray<float> c(ctx,queue);
for (int i=0;i<numElements;i++)
{
a.push_back(float(i));
b.push_back(float(i));
}
c.resize(numElements);
b3LauncherCL launcher( queue, addKernel);
launcher.setBuffer( a.getBufferCL());
launcher.setBuffer( b.getBufferCL());
launcher.setBuffer( c.getBufferCL());
launcher.setConst( numElements );
launcher.launch1D( numElements);
for (int i=0;i<numElements;i++)
{
float v = c.at(i);
printf("c[%d]=%f\n",i,v);
}
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
return 0;
}

View File

@ -1,41 +0,0 @@
-- Premake script for the simplified OpenCL vector-add sample.
-- Defines one console project per OpenCL vendor whose SDK is found by findOpenCL.
function createProject (vendor)
	local hasCL = findOpenCL(vendor)
	if not hasCL then
		-- no OpenCL SDK for this vendor: do not generate a project
		return
	end

	project ( "OpenCL_vector_add_simplified_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize",
		"../../src"
	}

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
	}
end

for _, vendor in ipairs({"AMD", "NVIDIA", "Intel", "Apple"}) do
	createProject(vendor)
end

View File

@ -1,15 +1,15 @@
#include "b3GpuSapBroadphase.h"
#include "Bullet3Common/b3Vector3.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Quickprof.h"
#include "basic_initialize/b3OpenCLUtils.h"
#include "../kernels/sapKernels.h"
#include "../kernels/sapFastKernels.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
#include "kernels/sapFastKernels.h"
#include "Bullet3Common/b3MinMax.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
@ -28,9 +28,9 @@ m_currentBuffer(-1)
cl_int errNum=0;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
b3Assert(errNum==CL_SUCCESS);
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
b3Assert(errNum==CL_SUCCESS);

View File

@ -1,10 +1,10 @@
#ifndef B3_GPU_SAP_BROADPHASE_H
#define B3_GPU_SAP_BROADPHASE_H
#include "parallel_primitives/host/b3OpenCLArray.h"
#include "parallel_primitives/host/b3FillCL.h" //b3Int2
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "parallel_primitives/host/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "b3SapAabb.h"

View File

@ -29,18 +29,23 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;
#include "Bullet3Common/b3Quickprof.h"
#include <float.h> //for FLT_MAX
#include "basic_initialize/b3OpenCLUtils.h"
#include "parallel_primitives/host/b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
//#include "AdlQuaternion.h"
#include "../kernels/satKernels.h"
#include "../kernels/satClipHullContacts.h"
#include "../kernels/bvhTraversal.h"
#include "../kernels/primitiveContacts.h"
#include "kernels/satKernels.h"
#include "kernels/satClipHullContacts.h"
#include "kernels/bvhTraversal.h"
#include "kernels/primitiveContacts.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl"
#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl"
#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl"
#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl"
#define dot3F4 b3Dot
@ -64,7 +69,7 @@ m_totalContactsOut(m_context, m_queue)
// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
//#endif
cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,"opencl/gpu_narrowphase/kernels/sat.cl");
cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,BT_NARROWPHASE_SAT_PATH);
b3Assert(errNum==CL_SUCCESS);
m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "findSeparatingAxisKernel",&errNum,satProg );
@ -92,7 +97,7 @@ m_totalContactsOut(m_context, m_queue)
// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
//#endif
cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,"opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH);
b3Assert(errNum==CL_SUCCESS);
m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
@ -132,7 +137,7 @@ m_totalContactsOut(m_context, m_queue)
if (1)
{
const char* srcBvh = bvhTraversalKernelCL;
cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"","opencl/gpu_narrowphase/kernels/bvhTraversal.cl");
cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH);
b3Assert(errNum==CL_SUCCESS);
m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,"");
@ -142,7 +147,7 @@ m_totalContactsOut(m_context, m_queue)
{
const char* primitiveContactsSrc = primitiveContactsKernelsCL;
cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"","opencl/gpu_narrowphase/kernels/primitiveContacts.cl");
cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH);
b3Assert(errNum==CL_SUCCESS);
m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,"");
@ -527,7 +532,7 @@ void computeContactPlaneConvex(int pairIndex,
b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
c->m_worldPos[i] = pOnB1;
}
c->m_worldNormal[3] = numReducedPoints;
c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
}//if (dstIdx < numPairs)
}
@ -665,7 +670,7 @@ void computeContactPlaneCompound(int pairIndex,
b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
c->m_worldPos[i] = pOnB1;
}
c->m_worldNormal[3] = numReducedPoints;
c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
}//if (dstIdx < numPairs)
}
@ -825,7 +830,7 @@ void computeContactSphereConvex(int pairIndex,
c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
c->m_worldPos[0] = pOnB1;
int numPoints = 1;
c->m_worldNormal[3] = numPoints;
c->m_worldNormal[3] = (b3Scalar)numPoints;
}//if (dstIdx < numPairs)
}
}//if (hasCollision)

View File

@ -2,15 +2,15 @@
#ifndef _CONVEX_HULL_CONTACT_H
#define _CONVEX_HULL_CONTACT_H
#include "parallel_primitives/host/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3ConvexUtility.h"
#include "b3ConvexPolyhedronCL.h"
#include "b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "parallel_primitives/host/b3Int2.h"
#include "parallel_primitives/host/b3Int4.h"
#include "Bullet3Common/b3Int2.h"
#include "Bullet3Common/b3Int4.h"
#include "b3OptimizedBvh.h"
#include "b3BvhInfo.h"

Some files were not shown because too many files have changed in this diff Show More