mirror of
https://github.com/bulletphysics/bullet3
synced 2024-12-14 13:50:04 +00:00
reorder files, in preparation for Bullet 3 -> Bullet 2 merge
This commit is contained in:
parent
55b69201a9
commit
3ac332f3a7
Demos3
GpuDemos
GpuDemo.cppGpuDemo.hGpuDemoInternalData.hParticleDemo.cppParticleDemo.hParticleKernels.cl
broadphase
gwenUserInterface.cppgwenUserInterface.hmain_opengl3core.cpppremake4.luarigidbody
GpuGuiInitialize
Wavefront
list.cpplist.hobjLoader.cppobjLoader.hobjTester.cppobj_parser.cppobj_parser.hpremake4.luastring_extra.cppstring_extra.h
donttouch
btgui/GwenOpenGLTest
build
opencl
basic_initialize
gpu_broadphase/test
gpu_narrowphase/test
lds_bank_conflict
parallel_primitives
reduce
vector_add
vector_add_simplified
src
Bullet3Common
Bullet3OpenCL
BroadphaseCollision
Initialize
NarrowphaseCollision
b3BvhInfo.hb3Collidable.hb3ConvexHullContact.cppb3ConvexHullContact.hb3ConvexPolyhedronCL.hb3ConvexUtility.cppb3ConvexUtility.hb3OptimizedBvh.cppb3OptimizedBvh.hb3QuantizedBvh.cppb3QuantizedBvh.hb3StridingMeshInterface.cppb3StridingMeshInterface.hb3TriangleCallback.cppb3TriangleCallback.hb3TriangleIndexVertexArray.cppb3TriangleIndexVertexArray.h
kernels
@ -1,7 +1,7 @@
|
||||
#include "GpuDemo.h"
|
||||
#include "GpuDemoInternalData.h"
|
||||
#include "Bullet3Common/b3Scalar.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/ShapeData.h"
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef GPU_DEMO_INTERNAL_DATA_H
|
||||
#define GPU_DEMO_INTERNAL_DATA_H
|
||||
|
||||
#include "basic_initialize/b3OpenCLInclude.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
|
||||
|
||||
struct GpuDemoInternalData
|
||||
{
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "OpenGLWindow/ShapeData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
|
||||
#define MSTRINGIFY(A) #A
|
||||
static char* particleKernelsString =
|
||||
@ -12,10 +12,10 @@ static char* particleKernelsString =
|
||||
#include "Bullet3Common/b3Vector3.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
//#include "../../opencl/primitives/AdlPrimitives/Math/Math.h"
|
||||
//#include "../../opencl/broadphase_benchmark/b3GridBroadphaseCL.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "GpuDemoInternalData.h"
|
||||
|
||||
|
@ -4,12 +4,12 @@
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
|
||||
static b3KeyboardCallback oldCallback = 0;
|
||||
extern bool gReset;
|
@ -4,7 +4,7 @@ function createProject(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("Bullet3_OpenCL_gpu_demo_" .. vendor)
|
||||
project ("App_Bullet3_OpenCL_Demos_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
@ -20,8 +20,7 @@ function createProject(vendor)
|
||||
includedirs {
|
||||
"..",
|
||||
"../../src",
|
||||
"../../btgui",
|
||||
"../../opencl"
|
||||
"../../btgui"
|
||||
}
|
||||
|
||||
links {
|
||||
@ -30,21 +29,23 @@ function createProject(vendor)
|
||||
"Bullet3Geometry",
|
||||
"Bullet3Collision",
|
||||
"Bullet3Dynamics",
|
||||
"Bullet2FileLoader"
|
||||
"Bullet2FileLoader",
|
||||
"Bullet3OpenCL_" .. vendor
|
||||
|
||||
}
|
||||
|
||||
files {
|
||||
"**.cpp",
|
||||
"**.h",
|
||||
|
||||
"../ObjLoader/string_extra.cpp",
|
||||
"../ObjLoader/string_extra.h",
|
||||
"../ObjLoader/objLoader.cpp",
|
||||
"../ObjLoader/objLoader.h",
|
||||
"../ObjLoader/obj_parser.cpp",
|
||||
"../ObjLoader/obj_parser.h",
|
||||
"../ObjLoader/list.cpp",
|
||||
"../ObjLoader/list.h",
|
||||
"../Wavefront/string_extra.cpp",
|
||||
"../Wavefront/string_extra.h",
|
||||
"../Wavefront/objLoader.cpp",
|
||||
"../Wavefront/objLoader.h",
|
||||
"../Wavefront/obj_parser.cpp",
|
||||
"../Wavefront/obj_parser.h",
|
||||
"../Wavefront/list.cpp",
|
||||
"../Wavefront/list.h",
|
||||
|
||||
|
||||
"../../btgui/OpenGLWindow/GLInstancingRenderer.cpp",
|
||||
@ -60,21 +61,6 @@ function createProject(vendor)
|
||||
"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.cpp",
|
||||
"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.h",
|
||||
"../../btgui/FontFiles/OpenSans.cpp",
|
||||
"../../opencl/basic_initialize/b3OpenCLUtils.cpp",
|
||||
"../../opencl/basic_initialize/b3OpenCLUtils.h",
|
||||
"../../opencl/gpu_broadphase/host/b3GpuSapBroadphase.cpp",
|
||||
"../../opencl/gpu_narrowphase/host/**.cpp",
|
||||
"../../opencl/gpu_narrowphase/host/**.h",
|
||||
"../../opencl/parallel_primitives/host/b3BoundSearchCL.cpp",
|
||||
"../../opencl/parallel_primitives/host/b3BoundSearchCL.h",
|
||||
"../../opencl/parallel_primitives/host/b3FillCL.cpp",
|
||||
"../../opencl/parallel_primitives/host/b3FillCL.h",
|
||||
"../../opencl/parallel_primitives/host/b3PrefixScanCL.cpp",
|
||||
"../../opencl/parallel_primitives/host/b3PrefixScanCL.h",
|
||||
"../../opencl/parallel_primitives/host/b3RadixSort32CL.cpp",
|
||||
"../../opencl/parallel_primitives/host/b3RadixSort32CL.h",
|
||||
"../../opencl/gpu_rigidbody/host/**.cpp",
|
||||
"../../opencl/gpu_rigidbody/host/**.h",
|
||||
|
||||
}
|
||||
|
@ -22,11 +22,11 @@ extern bool enableExperimentalCpuConcaveCollision;
|
||||
//#include "LinearMath/b3Quickprof.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "Bullet3Common/b3Matrix3x3.h"
|
||||
#include "gpu_narrowphase/host/b3ConvexUtility.h"
|
||||
#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.h"
|
||||
#include "OpenGLWindow/ShapeData.h"
|
||||
#include "../../ObjLoader/objLoader.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "../../Wavefront/objLoader.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
|
||||
///work-in-progress
|
||||
///This ReadBulletSample is kept as simple as possible without dependencies to the Bullet SDK.
|
@ -6,17 +6,17 @@
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "gpu_rigidbody/host/b3Config.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3Config.h"
|
||||
#include "GpuRigidBodyDemoInternalData.h"
|
||||
#include"../../ObjLoader/objLoader.h"
|
||||
#include"../../Wavefront/objLoader.h"
|
||||
#include "Bullet3Common/b3Transform.h"
|
||||
|
||||
#include "OpenGLWindow/GLInstanceGraphicsShape.h"
|
@ -6,15 +6,15 @@
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "gpu_rigidbody/host/b3Config.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3Config.h"
|
||||
#include "GpuRigidBodyDemoInternalData.h"
|
||||
#include "Bullet3Common/b3Transform.h"
|
||||
|
@ -6,15 +6,15 @@
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "gpu_rigidbody/host/b3Config.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3Config.h"
|
||||
#include "GpuRigidBodyDemoInternalData.h"
|
||||
#include "../gwenUserInterface.h"
|
||||
#include "Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h"
|
@ -4,15 +4,15 @@
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "gpu_rigidbody/host/b3Config.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3Config.h"
|
||||
#include "GpuRigidBodyDemoInternalData.h"
|
||||
#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
|
||||
|
@ -1,8 +1,8 @@
|
||||
#ifndef GPU_RIGIDBODY_INTERNAL_DATA_H
|
||||
#define GPU_RIGIDBODY_INTERNAL_DATA_H
|
||||
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3Common/b3Vector3.h"
|
||||
|
||||
struct GpuRigidBodyDemoInternalData
|
@ -2,19 +2,18 @@
|
||||
#include "GpuRigidBodyDemo.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
#include "OpenGLWindow/ShapeData.h"
|
||||
|
||||
#include "OpenGLWindow/GLInstancingRenderer.h"
|
||||
#include "Bullet3Common/b3Quaternion.h"
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "OpenGLWindow/OpenGLInclude.h"
|
||||
#include "OpenGLWindow/GLInstanceRendererInternalData.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
|
||||
#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
|
||||
#include "gpu_rigidbody/host/b3Config.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
|
||||
#include "Bullet3OpenCL/RigidBody/b3Config.h"
|
||||
#include "GpuRigidBodyDemoInternalData.h"
|
||||
#include "../gwenUserInterface.h"
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
project "Gwen_OpenGLTest"
|
||||
project "Test_Gwen_OpenGL"
|
||||
|
||||
kind "ConsoleApp"
|
||||
flags {"Unicode"}
|
||||
|
@ -91,30 +91,35 @@
|
||||
|
||||
|
||||
if not _OPTIONS["ios"] then
|
||||
include "../demo/gpudemo"
|
||||
include "../btgui/MidiTest"
|
||||
-- include "../demo/gpudemo"
|
||||
-- include "../btgui/MidiTest"
|
||||
-- include "../opencl/vector_add_simplified"
|
||||
-- include "../opencl/vector_add"
|
||||
include "../opencl/basic_initialize"
|
||||
include "../demo/gpu_initialize"
|
||||
include "../opencl/parallel_primitives/host"
|
||||
include "../opencl/parallel_primitives/test"
|
||||
include "../opencl/parallel_primitives/benchmark"
|
||||
include "../opencl/lds_bank_conflict"
|
||||
-- include "../opencl/reduce"
|
||||
-- include "../opencl/gpu_broadphase/test"
|
||||
-- include "../opencl/gpu_narrowphase/test"
|
||||
include "../btgui/Gwen"
|
||||
include "../btgui/GwenOpenGLTest"
|
||||
include "../btgui/OpenGLTrueTypeFont"
|
||||
-- include "../btgui/OpenGLWindow"
|
||||
-- include "../demo/ObjLoader"
|
||||
|
||||
include "../test/OpenCL/BasicInitialize"
|
||||
include "../test/OpenCL/BroadphaseCollision"
|
||||
include "../test/OpenCL/NarrowphaseCollision"
|
||||
include "../test/OpenCL/ParallelPrimitives"
|
||||
|
||||
include "../src/Bullet3Dynamics"
|
||||
include "../src/Bullet3Common"
|
||||
include "../src/Bullet3Geometry"
|
||||
include "../src/Bullet3Collision"
|
||||
include "../src/Bullet3Serialize/Bullet2FileLoader"
|
||||
|
||||
include "../src/Bullet3OpenCL"
|
||||
include "../Demos3/GpuDemos"
|
||||
|
||||
-- include "../demo/gpu_initialize"
|
||||
-- include "../opencl/lds_bank_conflict"
|
||||
-- include "../opencl/reduce"
|
||||
-- include "../btgui/OpenGLTrueTypeFont"
|
||||
-- include "../btgui/OpenGLWindow"
|
||||
-- include "../demo/ObjLoader"
|
||||
|
||||
|
||||
include "../test/b3DynamicBvhBroadphase"
|
||||
-- include "../test/b3DynamicBvhBroadphase"
|
||||
|
||||
end
|
||||
|
@ -1,28 +0,0 @@
|
||||
-- Declares the OpenCL initialization console application for one vendor SDK.
-- The project is only declared when findOpenCL(vendor) returns a true value.
function createProject(vendor)
	-- local: keeps the lookup result from leaking into the global scope that
	-- every other premake script shares
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_intialize_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../bin"

		files {
			"main.cpp",
			"b3OpenCLUtils.cpp",
			"b3OpenCLUtils.h"
		}

	end
end

createProject("Apple")
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
|
@ -1,129 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "../basic_initialize/b3OpenCLUtils.h"
|
||||
#include "../host/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3Common/b3Vector3.h"
|
||||
#include "parallel_primitives/host/b3FillCL.h"
|
||||
#include "parallel_primitives/host/b3BoundSearchCL.h"
|
||||
#include "parallel_primitives/host/b3RadixSort32CL.h"
|
||||
#include "parallel_primitives/host/b3PrefixScanCL.h"
|
||||
#include "Bullet3Common/b3CommandLineArgs.h"
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
|
||||
// Running totals over every test executed by this program.
int g_nPassed = 0;
int g_nFailed = 0;
// Set by TEST_ASSERT when a check fails inside the test currently running.
bool g_testFailed = false;

// Each macro expands to exactly one statement (do/while(0)), so it can be used
// as the unbraced body of an if/else without changing which branch owns the else.
// Invoke them with a trailing semicolon.
#define TEST_INIT do { g_testFailed = false; } while (0)
#define TEST_ASSERT(x) do { if( !(x) ){ g_testFailed = true; } } while (0)
// Prints "[O] name" for a passing test or "[X] name" for a failing one and updates the totals.
#define TEST_REPORT(testName) \
	do { \
		printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); \
		if(g_testFailed) g_nFailed++; else g_nPassed++; \
	} while (0)
// Rounds num up to the next multiple of alignment (num itself when already a multiple).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = b3OpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
b3OpenCLDeviceInfo info;
|
||||
g_device= b3OpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
b3OpenCLUtils::printDeviceInfo(g_device);
|
||||
b3OpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void broadphaseTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
b3GpuSapBroadphase* sap = new b3GpuSapBroadphase(g_context,g_device,g_queue);
|
||||
int group=1;
|
||||
int mask=1;
|
||||
b3Vector3 aabbMin(0,0,0);
|
||||
b3Vector3 aabbMax(1,1,1);
|
||||
int usrPtr = 1;
|
||||
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
|
||||
|
||||
aabbMin.setValue(1,1,1);
|
||||
aabbMax.setValue(2,2,2);
|
||||
|
||||
usrPtr = 2;
|
||||
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
|
||||
sap->writeAabbsToGpu();
|
||||
|
||||
sap->calculateOverlappingPairs();
|
||||
|
||||
int numOverlap = sap->getNumOverlap();
|
||||
cl_mem buf = sap->getOverlappingPairBuffer();
|
||||
|
||||
TEST_ASSERT(numOverlap==1);
|
||||
|
||||
delete sap;
|
||||
|
||||
TEST_REPORT( "broadphaseTest" );
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1;
|
||||
int preferredPlatformIndex = -1;
|
||||
|
||||
b3CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
|
||||
broadphaseTest();
|
||||
|
||||
printf("%d tests passed\n",g_nPassed, g_nFailed);
|
||||
if (g_nFailed)
|
||||
{
|
||||
printf("%d tests failed\n",g_nFailed);
|
||||
}
|
||||
printf("End, press <enter>\n");
|
||||
|
||||
getchar();
|
||||
|
||||
exitCL();
|
||||
|
||||
}
|
||||
|
@ -1,46 +0,0 @@
|
||||
-- Declares the GPU broadphase console test for one vendor SDK.
-- The project is only declared when findOpenCL(vendor) returns a true value.
function createProject(vendor)
	-- local: keeps the lookup result from leaking into the global scope that
	-- every other premake script shares
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_broadphase_test_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../..","../../../src"}

		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3GpuSapBroadphase.cpp",
			"../host/b3GpuSapBroadphase.h",
			-- the parallel primitives carry the b3 prefix (main.cpp includes
			-- parallel_primitives/host/b3FillCL.h etc.); the former bt-prefixed
			-- entries matched no file, leaving these sources out of the build
			"../../parallel_primitives/host/b3FillCL.cpp",
			"../../parallel_primitives/host/b3FillCL.h",
			"../../parallel_primitives/host/b3BoundSearchCL.cpp",
			"../../parallel_primitives/host/b3BoundSearchCL.h",
			"../../parallel_primitives/host/b3PrefixScanCL.cpp",
			"../../parallel_primitives/host/b3PrefixScanCL.h",
			"../../parallel_primitives/host/b3RadixSort32CL.cpp",
			"../../parallel_primitives/host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")
|
@ -1,111 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "../basic_initialize/b3OpenCLUtils.h"
|
||||
#include "../host/b3ConvexHullContact.h"
|
||||
|
||||
#include "Bullet3Common/b3Vector3.h"
|
||||
#include "parallel_primitives/host/b3FillCL.h"
|
||||
#include "parallel_primitives/host/b3BoundSearchCL.h"
|
||||
#include "parallel_primitives/host/b3RadixSort32CL.h"
|
||||
#include "parallel_primitives/host/b3PrefixScanCL.h"
|
||||
#include "Bullet3Common/b3CommandLineArgs.h"
|
||||
#include "../host/b3ConvexHullContact.h"
|
||||
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
// Running totals over every test executed by this program.
int g_nPassed = 0;
int g_nFailed = 0;
// Set by TEST_ASSERT when a check fails inside the test currently running.
bool g_testFailed = false;

// Each macro expands to exactly one statement (do/while(0)), so it can be used
// as the unbraced body of an if/else without changing which branch owns the else.
// Invoke them with a trailing semicolon.
#define TEST_INIT do { g_testFailed = false; } while (0)
#define TEST_ASSERT(x) do { if( !(x) ){ g_testFailed = true; } } while (0)
// Prints "[O] name" for a passing test or "[X] name" for a failing one and updates the totals.
#define TEST_REPORT(testName) \
	do { \
		printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); \
		if(g_testFailed) g_nFailed++; else g_nPassed++; \
	} while (0)
// Rounds num up to the next multiple of alignment (num itself when already a multiple).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = b3OpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
b3OpenCLDeviceInfo info;
|
||||
g_device= b3OpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
b3OpenCLUtils::printDeviceInfo(g_device);
|
||||
b3OpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void gpuConvexHullContactTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
TEST_ASSERT(1);
|
||||
|
||||
GpuSatCollision* sat = new GpuSatCollision(g_context,g_device,g_queue);
|
||||
|
||||
delete sat;
|
||||
|
||||
TEST_REPORT( "gpuConvexHullContactTest" );
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1; int preferredPlatformIndex = -1;
|
||||
|
||||
b3CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
gpuConvexHullContactTest();
|
||||
|
||||
printf("%d tests passed\n",g_nPassed, g_nFailed);
|
||||
if (g_nFailed)
|
||||
{
|
||||
printf("%d tests failed\n",g_nFailed);
|
||||
}
|
||||
printf("End, press <enter>\n");
|
||||
|
||||
getchar();
|
||||
|
||||
exitCL();
|
||||
|
||||
}
|
||||
|
@ -1,49 +0,0 @@
|
||||
-- Declares the GPU narrowphase (SAT) console test for one vendor SDK.
-- The project is only declared when findOpenCL(vendor) returns a true value.
function createProject(vendor)
	-- local: keeps the lookup result from leaking into the global scope that
	-- every other premake script shares
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_sat_test_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../..","../../../src"}

		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/**.cpp",
			"../host/**.h",
			-- the parallel primitives carry the b3 prefix (main.cpp includes
			-- parallel_primitives/host/b3FillCL.h etc.); the former bt-prefixed
			-- entries matched no file, leaving these sources out of the build
			"../../parallel_primitives/host/b3FillCL.cpp",
			"../../parallel_primitives/host/b3FillCL.h",
			"../../parallel_primitives/host/b3BoundSearchCL.cpp",
			"../../parallel_primitives/host/b3BoundSearchCL.h",
			"../../parallel_primitives/host/b3PrefixScanCL.cpp",
			"../../parallel_primitives/host/b3PrefixScanCL.h",
			"../../parallel_primitives/host/b3RadixSort32CL.cpp",
			"../../parallel_primitives/host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
			"../../../src/Bullet3Geometry/**.cpp",
			"../../../src/Bullet3Geometry/**.h",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")
|
@ -1,171 +0,0 @@
|
||||
|
||||
#define TILE_DIM 32
|
||||
#define BLOCK_ROWS 8
|
||||
|
||||
|
||||
/*// simple copy kernel (CUDA)
|
||||
// Used as reference case representing best effective bandwidth.
|
||||
__global__ void copy(float *odata, const float *idata)
|
||||
{
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = idata[(y+j)*width + x];
|
||||
}
|
||||
*/
|
||||
// simple copy kernel (OpenCL)
// Used as reference case representing best effective bandwidth.
// Each work-group copies one TILE_DIM x TILE_DIM tile; every work-item copies the
// rows j = 0, BLOCK_ROWS, 2*BLOCK_ROWS, ... of its column within that tile.
// NOTE(review): assumes a local size of (TILE_DIM, BLOCK_ROWS), as in the CUDA
// reference this kernel was ported from - confirm against the host launch code.
__kernel void copyKernel(__global float* odata, __global const float* idata)
{
	// tile origin is group id * tile size; scaling by the number of groups was
	// only correct when the grid happened to contain TILE_DIM groups per dimension
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * TILE_DIM;

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
	{
		odata[(y+j)*width + x] = idata[(y+j)*width + x];
	}
}
|
||||
|
||||
/*
|
||||
// copy kernel using shared memory (CUDA)
|
||||
// Also used as reference case, demonstrating effect of using shared memory.
|
||||
__global__ void copySharedMem(float *odata, const float *idata)
|
||||
{
|
||||
__shared__ float tile[TILE_DIM * TILE_DIM];
|
||||
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
|
||||
}
|
||||
*/
|
||||
|
||||
// copy kernel using shared memory (OpenCL)
// Also used as reference case, demonstrating effect of using shared memory.
// Each work-group stages one TILE_DIM x TILE_DIM tile in local memory and writes it
// back to the same position in odata.
// NOTE(review): assumes a local size of (TILE_DIM, BLOCK_ROWS), as in the CUDA
// reference this kernel was ported from - confirm against the host launch code.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM * TILE_DIM];

	// tile origin is group id * tile size; scaling by the number of groups was
	// only correct when the grid happened to contain TILE_DIM groups per dimension
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * TILE_DIM;

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];

	// local-memory barrier between filling the tile and reading it back
	barrier(CLK_LOCAL_MEM_FENCE);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}
|
||||
|
||||
/*
|
||||
// naive transpose (CUDA)
|
||||
// Simplest transpose; doesn't use shared memory.
|
||||
// Global memory reads are coalesced but writes are not.
|
||||
__global__ void transposeNaive(float *odata, const float *idata)
|
||||
{
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
|
||||
odata[x*width + (y+j)] = idata[(y+j)*width + x];
|
||||
}
|
||||
*/
|
||||
|
||||
// naive transpose (OpenCL)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
// Element (row y+j, column x) of idata is written to (row x, column y+j) of odata.
// NOTE(review): assumes a local size of (TILE_DIM, BLOCK_ROWS), as in the CUDA
// reference this kernel was ported from - confirm against the host launch code.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
	// tile origin is group id * tile size; scaling by the number of groups was
	// only correct when the grid happened to contain TILE_DIM groups per dimension
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * TILE_DIM;

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
|
||||
|
||||
/*
|
||||
// coalesced transpose (CUDA)
|
||||
// Uses shared memory to achieve coalescing in both reads and writes
|
||||
// Tile width == #banks causes shared memory bank conflicts.
|
||||
__global__ void transposeCoalesced(float *odata, const float *idata)
|
||||
{
|
||||
__shared__ float tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
|
||||
y = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
|
||||
}
|
||||
*/
|
||||
|
||||
// coalesced transpose (OpenCL)
|
||||
// Uses shared memory to achieve coalesing in both reads and writes
|
||||
// Tile width == #banks causes shared memory bank conflicts.
|
||||
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
|
||||
{
|
||||
__local float tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
|
||||
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
|
||||
int width = get_num_groups(0) * get_local_size(0);
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
x = get_group_id(1) * TILE_DIM + get_local_id(0);
|
||||
y = get_group_id(0) * TILE_DIM + get_local_id(1);
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
|
||||
}
|
||||
|
||||
|
||||
// No bank-conflict transpose (OpenCL)
|
||||
// Same as transposeCoalesced except the first tile dimension is padded
|
||||
// to avoid shared memory bank conflicts.
|
||||
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
|
||||
{
|
||||
__local float tile[TILE_DIM][TILE_DIM+1];
|
||||
|
||||
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
|
||||
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
|
||||
int width = get_num_groups(0) * get_local_size(0);
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
x = get_group_id(1) * TILE_DIM + get_local_id(0);
|
||||
y = get_group_id(0) * TILE_DIM + get_local_id(1);
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,361 +0,0 @@
|
||||
//Adapted from CUDA to OpenCL by Erwin Coumans
|
||||
//See http://bitbucket.org/erwincoumans/opencl_course
|
||||
|
||||
// Copyright 2012 NVIDIA Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "b3OpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "../parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
#include "../parallel_primitives/host/b3FillCL.h"
|
||||
#include "Bullet3Common/b3CommandLineArgs.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl
|
||||
const int TILE_DIM = 32;
|
||||
const int BLOCK_ROWS = 8;
|
||||
const int NUM_REPS = 100;
|
||||
|
||||
// Check errors and print GB/s
|
||||
void postprocess(const float *ref, const float *res, int n, float ms)
|
||||
{
|
||||
bool passed = true;
|
||||
for (int i = 0; i < n; i++)
|
||||
if (res[i] != ref[i]) {
|
||||
printf("\nError: at res[%d] got %f but expected %f\n", i, res[i], ref[i]);
|
||||
printf("%25s\n", "*** FAILED ***");
|
||||
passed = false;
|
||||
break;
|
||||
}
|
||||
if (passed)
|
||||
printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
|
||||
}
|
||||
|
||||
// Reads a whole file and returns it as a malloc'ed, NUL-terminated string with
// cPreamble prepended.
//
// cFilename      path of the file to read (opened in binary mode)
// cPreamble      text placed in front of the file contents; NULL is treated as ""
// szFinalLength  if non-NULL, receives the combined length (preamble + file),
//                not counting the terminating NUL; only written on success
//
// Returns the combined string, which the caller must free(), or NULL if the
// file cannot be opened, its size cannot be determined, memory cannot be
// allocated, or the file cannot be read completely.
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	if (cFilename == NULL)
	{
		return NULL;
	}
	if (cPreamble == NULL)
	{
		cPreamble = "";
	}

	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	const size_t szPreambleLength = strlen(cPreamble);
	char* cSourceString = NULL;

	// get the length of the source code; ftell reports failure as a negative value
	long fileLength = -1;
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		fileLength = ftell(pFileStream);
	}

	if (fileLength >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
	{
		const size_t szSourceLength = (size_t)fileLength;

		// allocate a buffer for the source code string and read it in
		cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
		if (cSourceString != NULL)
		{
			memcpy(cSourceString, cPreamble, szPreambleLength);

			// Reading byte-wise lets a short read be detected, and also
			// handles an empty file (0 bytes requested, 0 returned).
			const size_t szRead = fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream);
			if (szRead == szSourceLength)
			{
				cSourceString[szSourceLength + szPreambleLength] = '\0';
				if (szFinalLength != NULL)
				{
					*szFinalLength = szSourceLength + szPreambleLength;
				}
			}
			else
			{
				// never hand back a partially initialised buffer
				free(cSourceString);
				cSourceString = NULL;
			}
		}
	}

	// close the file on every path
	fclose(pFileStream);

	return cSourceString;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
|
||||
b3CommandLineArgs args(argc,argv);
|
||||
|
||||
const int nx = 1024;
|
||||
const int ny = 1024;
|
||||
|
||||
const int mem_size = nx*ny*sizeof(float);
|
||||
const int num_elements = nx*ny;
|
||||
b3Clock clock;
|
||||
double startEvent=0.f;
|
||||
double stopEvent=0.f;
|
||||
|
||||
int localSizeX = TILE_DIM;
|
||||
int localSizeY = BLOCK_ROWS;
|
||||
|
||||
int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
|
||||
int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;
|
||||
|
||||
int gridX = numThreadsX / localSizeX;
|
||||
int gridY = numThreadsY / localSizeY;
|
||||
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
args.GetCmdLineArgument("deviceId",preferred_device);
|
||||
args.GetCmdLineArgument("platformId",preferred_platform);
|
||||
|
||||
|
||||
cl_platform_id platformId=0;
|
||||
cl_context ctx=0;
|
||||
cl_command_queue queue=0;
|
||||
cl_device_id device=0;
|
||||
cl_kernel copyKernel=0;
|
||||
cl_kernel copySharedMemKernel=0;
|
||||
cl_kernel transposeNaiveKernel = 0;
|
||||
cl_kernel transposeCoalescedKernel = 0;
|
||||
cl_kernel transposeNoBankConflictsKernel= 0;
|
||||
|
||||
|
||||
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
b3OpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
device = b3OpenCLUtils::getDevice(ctx,0);
|
||||
b3OpenCLUtils::printDeviceInfo(device);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
|
||||
const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";
|
||||
|
||||
size_t szKernelLength;
|
||||
|
||||
const char* cSourceCL =0;
|
||||
char relativeFileName[1024];
|
||||
|
||||
{
|
||||
const char* prefix[]={"./","../","../../","../../../","../../../../"};
|
||||
int numPrefixes = sizeof(prefix)/sizeof(char*);
|
||||
|
||||
for (int i=0;!cSourceCL && i<numPrefixes;i++)
|
||||
{
|
||||
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
|
||||
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
|
||||
if (cSourceCL)
|
||||
{
|
||||
printf("Loaded program source: %s\n", relativeFileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!cSourceCL)
|
||||
{
|
||||
printf("Couldn't find file %s, exiting\n",cSourceFile);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
char flags[1024]={0};
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
|
||||
//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
|
||||
#endif//CL_PLATFORM_INTEL
|
||||
|
||||
|
||||
copyKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
|
||||
copySharedMemKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
|
||||
transposeNaiveKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
|
||||
transposeCoalescedKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
|
||||
transposeNoBankConflictsKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);
|
||||
|
||||
b3FillCL clMemSet(ctx,device,queue);
|
||||
|
||||
printf("\n============================================\n");
|
||||
|
||||
printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n",
|
||||
nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
|
||||
|
||||
float *h_idata = (float*)malloc(mem_size);
|
||||
float *h_cdata = (float*)malloc(mem_size);
|
||||
float *h_tdata = (float*)malloc(mem_size);
|
||||
float *gold = (float*)malloc(mem_size);
|
||||
|
||||
b3OpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
|
||||
b3OpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
|
||||
b3OpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);
|
||||
|
||||
|
||||
// check parameters and calculate execution configuration
|
||||
if (nx % TILE_DIM || ny % TILE_DIM)
|
||||
{
|
||||
printf("nx and ny must be a multiple of TILE_DIM\n");
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
if (TILE_DIM % BLOCK_ROWS)
|
||||
{
|
||||
printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
// host
|
||||
for (int j = 0; j < ny; j++)
|
||||
for (int i = 0; i < nx; i++)
|
||||
h_idata[j*nx + i] = j*nx + i;
|
||||
|
||||
// correct result for error checking
|
||||
for (int j = 0; j < ny; j++)
|
||||
for (int i = 0; i < nx; i++)
|
||||
{
|
||||
gold[j*nx + i] = h_idata[i*nx + j];
|
||||
}
|
||||
|
||||
d_idataCL.copyFromHostPointer(h_idata,num_elements);
|
||||
|
||||
// events for timing
|
||||
clock.reset();
|
||||
|
||||
float ms;
|
||||
|
||||
// ------------
|
||||
// time kernels
|
||||
// ------------
|
||||
printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
|
||||
|
||||
// ----
|
||||
// copy
|
||||
// ----
|
||||
printf("%25s", "copy");
|
||||
|
||||
clMemSet.execute(d_cdataCL,0.f,num_elements);
|
||||
|
||||
{
|
||||
// warm up
|
||||
b3LauncherCL launcher( queue, copyKernel);
|
||||
launcher.setBuffer( d_cdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
|
||||
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
|
||||
postprocess(h_idata, h_cdata, nx*ny, ms);
|
||||
|
||||
// -------------
|
||||
// copySharedMem
|
||||
// -------------
|
||||
printf("%25s", "shared memory copy");
|
||||
clMemSet.execute(d_cdataCL,0.f,num_elements);
|
||||
|
||||
{
|
||||
b3LauncherCL launcher( queue, copySharedMemKernel);
|
||||
launcher.setBuffer( d_cdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
|
||||
postprocess(h_idata, h_cdata, nx * ny, ms);
|
||||
|
||||
// --------------
|
||||
// transposeNaive
|
||||
// --------------
|
||||
printf("%25s", "naive transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
// warmup
|
||||
b3LauncherCL launcher( queue, transposeNaiveKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
// ------------------
|
||||
// transposeCoalesced
|
||||
// ------------------
|
||||
printf("%25s", "coalesced transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
b3LauncherCL launcher( queue, transposeCoalescedKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
// ------------------------
|
||||
// transposeNoBankConflicts
|
||||
// ------------------------
|
||||
printf("%25s", "conflict-free transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
b3LauncherCL launcher( queue, transposeNoBankConflictsKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
error_exit:
|
||||
// cleanup
|
||||
clReleaseKernel(copyKernel);
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
|
||||
free(h_idata);
|
||||
free(h_tdata);
|
||||
free(h_cdata);
|
||||
free(gold);
|
||||
printf("Press <enter>\n");
|
||||
getchar();
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
|
||||
-- Declares the "OpenCL_lds_bank_conflict_<vendor>" console application.
-- Nothing is declared when findOpenCL(vendor) returns a false value
-- (presumably meaning the vendor's OpenCL SDK was not found - verify in findOpenCL).
function createProject (vendor)

	local hasCL = findOpenCL(vendor)
	if (not hasCL) then
		return
	end

	project ( "OpenCL_lds_bank_conflict_" .. vendor)

	initOpenCL(vendor)

	language "C++"

	kind "ConsoleApp"
	targetdir "../../bin"

	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize",
		"../../src"
	}

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
		"../../src/Bullet3Common/b3Quickprof.cpp",
		"../../src/Bullet3Common/b3Quickprof.h",
	}

end

-- One project per supported OpenCL vendor, in the original declaration order.
for _, vendor in ipairs({"AMD", "NVIDIA", "Intel", "Apple"}) do
	createProject(vendor)
end
|
@ -1,40 +0,0 @@
|
||||
-- Declares the "OpenCL_radixsort_benchmark_<vendor>" console application.
-- Nothing is declared when findOpenCL(vendor) returns a false value
-- (presumably meaning the vendor's OpenCL SDK was not found - verify in findOpenCL).
function createProject(vendor)
	-- 'local' keeps the result out of the global environment shared by all premake scripts
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_radixsort_benchmark_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {"..","../../../src"}

		links {
			("OpenCL_lib_parallel_primitives_host_" .. vendor)
		}

		files {
			"test_large_problem_sorting.cpp",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3FillCL.cpp",
			"../host/b3PrefixScanCL.cpp",
			"../host/b3RadixSort32CL.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
			"../../../src/Bullet3Common/b3Quickprof.cpp",
			"../../../src/Bullet3Common/b3Quickprof.h",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")
|
@ -1,711 +0,0 @@
|
||||
/******************************************************************************
|
||||
* Copyright 2010 Duane Merrill
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* AUTHORS' REQUEST:
|
||||
*
|
||||
* If you use|reference|benchmark this code, please cite our Technical
|
||||
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
|
||||
*
|
||||
* @TechReport{ Merrill:Sorting:2010,
|
||||
* author = "Duane Merrill and Andrew Grimshaw",
|
||||
* title = "Revisiting Sorting for GPGPU Stream Architectures",
|
||||
* year = "2010",
|
||||
* institution = "University of Virginia, Department of Computer Science",
|
||||
* address = "Charlottesville, VA, USA",
|
||||
* number = "CS2010-03"
|
||||
* }
|
||||
*
|
||||
* For more information, see our Google Code project site:
|
||||
* http://code.google.com/p/back40computing/
|
||||
*
|
||||
* Thanks!
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Simple test driver program for *large-problem* radix sorting.
|
||||
*
|
||||
* Useful for demonstrating how to integrate radix sorting into
|
||||
* your application
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
|
||||
******************************************************************************/
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
|
||||
//#include <iostream>
|
||||
#include <sstream>
|
||||
/**********************
|
||||
*
|
||||
*/
|
||||
|
||||
#include "../host/b3RadixSort32CL.h"
|
||||
#include "../../basic_initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
|
||||
cl_context g_cxMainContext;
|
||||
cl_device_id g_device;
|
||||
cl_command_queue g_cqCommandQueue;
|
||||
|
||||
/***********************
|
||||
*
|
||||
*/
|
||||
|
||||
bool g_verbose;
|
||||
///Preferred OpenCL device/platform. When < 0 then no preference is used.
|
||||
///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
|
||||
///Preferred device/platform take priority over this platform-vendor match
|
||||
int gPreferredDeviceId = -1;
|
||||
int gPreferredPlatformId = -1;
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Routines
|
||||
******************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
unsigned int iterations)
|
||||
{
|
||||
printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
b3AlignedObjectArray<unsigned int> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i] = h_keys[i];
|
||||
}
|
||||
|
||||
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
b3AlignedObjectArray<unsigned int> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
b3Clock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
|
||||
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in,out] h_values
|
||||
* Vector of values to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K, typename V>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
V *h_values,
|
||||
unsigned int iterations)
|
||||
{
|
||||
|
||||
printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
b3AlignedObjectArray<b3SortData> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i].m_key = h_keys[i];
|
||||
hostData[i].m_value = h_values[i];
|
||||
}
|
||||
|
||||
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
b3AlignedObjectArray<b3SortData> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
#if 0
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
|
||||
printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
b3Clock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i].m_key;
|
||||
h_values[i] = hostData[i].m_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Generates random 32-bit keys.
 *
 * We always take the second-order byte from rand() because the higher-order
 * bits returned by rand() are commonly considered more uniformly distributed
 * than the lower-order bits.
 *
 * We can decrease the entropy level of keys by adopting the technique
 * of Thearling and Smith in which keys are computed from the bitwise AND of
 * multiple random samples:
 *
 * entropy_reduction	| Effectively-unique bits per key
 * -----------------------------------------------------
 * -1					| 0
 * 0					| 32
 * 1					| 25.95
 * 2					| 17.41
 * 3					| 10.78
 * 4					| 6.42
 * ...					| ...
 *
 * @param[out] key
 * 		Receives the generated key; a value that compares unequal to itself
 * 		(a floating point NaN) is rejected and regenerated
 * @param[in] entropy_reduction
 * 		Number of additional rand() samples ANDed into every byte
 * @param[in] lower_key_bits
 * 		When in [0, sizeof(K) * 8), only this many low bits of the key are kept;
 * 		any other value leaves the key unmasked
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
	unsigned char key_bits[NUM_UCHARS];

	do {

		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
			unsigned char quarterword = 0xff;
			for (int i = 0; i <= entropy_reduction; i++) {
				quarterword &= (rand() >> 7);
			}
			key_bits[j] = quarterword;
		}

		if (lower_key_bits >= 0 && (size_t) lower_key_bits < sizeof(K) * 8) {
			// NOTE(review): the masking goes through a 64-bit integer, so it
			// assumes sizeof(K) <= 8 and that the low-order bytes come first
			// in memory (little-endian) - confirm for other targets.
			unsigned long long base = 0;
			memcpy(&base, key_bits, sizeof(K));
			// The shift must be done in 64 bits: an int shift is undefined
			// for lower_key_bits >= 32 and produced a wrong mask for wide keys.
			base &= (1ULL << lower_key_bits) - 1;
			memcpy(key_bits, &base, sizeof(K));
		}

		memcpy(&key, key_bits, sizeof(K));

	} while (key != key);		// avoids NaNs when generating random floating point numbers
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Templated routines for printing keys/values to the console
|
||||
******************************************************************************/
|
||||
|
||||
// Prints a single value to stdout without a trailing newline.
// The primary template formats the value with "%d"; the explicit
// specializations below select the conversion for the other types.
template<typename T>
void PrintValue(T val)
{
	printf("%d", val);
}

// Floating point values use "%f".
template<>
void PrintValue<float>(float val)
{
	printf("%f", val);
}

template<>
void PrintValue<double>(double val)
{
	printf("%f", val);
}

// Unsigned types up to unsigned int use "%u".
template<>
void PrintValue<unsigned char>(unsigned char val)
{
	printf("%u", val);
}

template<>
void PrintValue<unsigned short>(unsigned short val)
{
	printf("%u", val);
}

template<>
void PrintValue<unsigned int>(unsigned int val)
{
	printf("%u", val);
}

// long and long long variants need the matching length modifier.
template<>
void PrintValue<long>(long val)
{
	printf("%ld", val);
}

template<>
void PrintValue<unsigned long>(unsigned long val)
{
	printf("%lu", val);
}

template<>
void PrintValue<long long>(long long val)
{
	printf("%lld", val);
}

template<>
void PrintValue<unsigned long long>(unsigned long long val)
{
	printf("%llu", val);
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Compares the equivalence of two arrays
|
||||
*/
|
||||
template <typename T, typename SizeT>
|
||||
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
|
||||
{
|
||||
printf("\n");
|
||||
for (SizeT i = 0; i < len; i++) {
|
||||
|
||||
if (computed[i] != reference[i]) {
|
||||
printf("INCORRECT: [%lu]: ", (unsigned long) i);
|
||||
PrintValue<T>(computed[i]);
|
||||
printf(" != ");
|
||||
PrintValue<T>(reference[i]);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nresult[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(computed[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
printf("\nreference[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(reference[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("CORRECT\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an example sorting problem whose keys is a vector of the specified
|
||||
* number of K elements, values of V elements, and then dispatches the problem
|
||||
* to the GPU for the given number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template<typename K, typename V>
|
||||
void TestSort(
|
||||
unsigned int iterations,
|
||||
int num_elements,
|
||||
bool keys_only)
|
||||
{
|
||||
// Allocate the sorting problem on the host and fill the keys with random bytes
|
||||
|
||||
K *h_keys = NULL;
|
||||
K *h_reference_keys = NULL;
|
||||
V *h_values = NULL;
|
||||
h_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
|
||||
|
||||
|
||||
// Use random bits
|
||||
for (unsigned int i = 0; i < num_elements; ++i) {
|
||||
RandomBits<K>(h_keys[i], 0);
|
||||
//h_keys[i] = num_elements-i;
|
||||
//h_keys[i] = 0xffffffffu-i;
|
||||
if (!keys_only)
|
||||
h_values[i] = h_keys[i];//0xffffffffu-i;
|
||||
|
||||
h_reference_keys[i] = h_keys[i];
|
||||
}
|
||||
|
||||
// Run the timing test
|
||||
if (keys_only) {
|
||||
TimedSort<K>(num_elements, h_keys, iterations);
|
||||
} else {
|
||||
TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
|
||||
}
|
||||
|
||||
// cudaThreadSynchronize();
|
||||
|
||||
// Display sorted key data
|
||||
if (g_verbose) {
|
||||
printf("\n\nKeys:\n");
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
PrintValue<K>(h_keys[i]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
// Verify solution
|
||||
std::sort(h_reference_keys, h_reference_keys + num_elements);
|
||||
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
|
||||
// Free our allocated host memory
|
||||
if (h_keys != NULL) free(h_keys);
|
||||
if (h_values != NULL) free(h_values);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Displays the commandline usage for this tool.
 *
 * The options and defaults listed here mirror what main() parses:
 * 10 iterations and 8*1024*1024 elements unless overridden.
 */
void Usage()
{
	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
	printf("\n");
	printf("\t--help\tDisplays this message and exits.\n");
	printf("\n");
	printf("\t--v\tDisplays sorted results to the console.\n");
	printf("\n");
	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
	printf("\n");
	printf("\t--n\tThe number of elements to comprise the sample problem\n");
	printf("\t\t\tDefault = 8388608\n");
	printf("\n");
	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
	printf("\n");
	printf("\t--deviceId, --platformId\tPreferred OpenCL device and platform index\n");
	printf("\n");
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
/**
 * Minimal parser for "--key" and "--key=value" command-line arguments.
 *
 * Arguments that do not begin with "--" are ignored. When a key is given
 * more than once, the last occurrence wins.
 */
class b3CommandLineArgs
{
protected:

	// Parsed arguments: key (without the leading "--") -> value text.
	// Keys given without '=' map to the empty string.
	std::map<std::string, std::string> pairs;

public:

	// Parses argv[1] .. argv[argc-1]; argv[0] is skipped.
	b3CommandLineArgs(int argc, char **argv)
	{
		for (int i = 1; i < argc; i++)
		{
			const std::string arg = argv[i];

			const bool hasDoubleDash = (arg[0] == '-') && (arg[1] == '-');
			if (!hasDoubleDash) {
				continue;
			}

			const std::string::size_type equalsPos = arg.find('=');
			if (equalsPos == std::string::npos) {
				// bare flag: everything after "--" is the key
				pairs[arg.substr(2)] = "";
			} else {
				// "--key=value": split around the first '='
				pairs[arg.substr(2, equalsPos - 2)] = arg.substr(equalsPos + 1);
			}
		}
	}

	// Returns true when arg_name was present on the command line,
	// with or without a value.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	// Parses the value of arg_name into val (defined out of class).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	// Number of distinct keys that were parsed.
	int ParsedArgc()
	{
		return static_cast<int>(pairs.size());
	}
};
|
||||
|
||||
template <typename T>
|
||||
void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Main
|
||||
******************************************************************************/
|
||||
|
||||
extern bool gDebugSkipLoadingBinary;
|
||||
|
||||
int main( int argc, char** argv)
|
||||
{
|
||||
//gDebugSkipLoadingBinary = true;
|
||||
|
||||
cl_int ciErrNum;
|
||||
b3CommandLineArgs args(argc,argv);
|
||||
|
||||
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
|
||||
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
|
||||
|
||||
printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
|
||||
cl_platform_id platformId;
|
||||
g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
|
||||
|
||||
if (!numDev)
|
||||
{
|
||||
printf("error: no OpenCL devices\n");
|
||||
exit(0);
|
||||
}
|
||||
int result;
|
||||
int devId = 0;
|
||||
g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
|
||||
b3OpenCLUtils_printDeviceInfo(g_device);
|
||||
// create a command-queue
|
||||
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
|
||||
|
||||
//srand(time(NULL));
|
||||
srand(0); // presently deterministic
|
||||
|
||||
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
|
||||
unsigned int iterations = 10;
|
||||
bool keys_only = true;
|
||||
|
||||
//
|
||||
// Check command line arguments
|
||||
//
|
||||
|
||||
|
||||
|
||||
if (args.CheckCmdLineFlag("help"))
|
||||
{
|
||||
Usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
args.GetCmdLineArgument("i", iterations);
|
||||
args.GetCmdLineArgument("n", num_elements);
|
||||
|
||||
|
||||
|
||||
keys_only = !args.CheckCmdLineFlag("key-values");
|
||||
g_verbose = args.CheckCmdLineFlag("v");
|
||||
|
||||
|
||||
|
||||
TestSort<unsigned int, unsigned int>(
|
||||
iterations,
|
||||
num_elements,
|
||||
keys_only);
|
||||
|
||||
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
#ifndef B3_INT2_H
|
||||
#define B3_INT2_H
|
||||
|
||||
/// Pair of unsigned integers that can be addressed either by name (x, y)
/// or by index (s[0], s[1]); both views occupy the same union storage.
struct b3UnsignedInt2
{
	union
	{
		struct
		{
			unsigned int x;
			unsigned int y;
		};
		unsigned int s[2];
	};
};
|
||||
|
||||
/// Pair of signed integers that can be addressed either by name (x, y)
/// or by index (s[0], s[1]); both views occupy the same union storage.
struct b3Int2
{
	union
	{
		struct
		{
			int x;
			int y;
		};
		int s[2];
	};
};
|
||||
|
||||
|
||||
#endif
|
@ -1,379 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "../basic_initialize/b3OpenCLUtils.h"
|
||||
#include "../host/b3FillCL.h"
|
||||
#include "../host/b3BoundSearchCL.h"
|
||||
#include "../host/b3RadixSort32CL.h"
|
||||
#include "../host/b3PrefixScanCL.h"
|
||||
#include "Bullet3Common/b3CommandLineArgs.h"
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
|
||||
// Running totals maintained by TEST_REPORT, and the failure flag of the
// test that is currently executing.
int g_nPassed = 0;
int g_nFailed = 0;
bool g_testFailed = 0;

// The statement-like macros are wrapped in do { } while (0) so that each one
// expands to exactly one statement. This keeps them safe inside unbraced
// if/else and loop bodies. Every use must be followed by a semicolon.

// Clears the failure flag at the start of a test.
#define TEST_INIT do { g_testFailed = 0; } while (0)
// Marks the current test as failed when x is false; never aborts.
#define TEST_ASSERT(x) do { if( !(x) ) { g_testFailed = 1; } } while (0)
// Prints "[X] name" or "[O] name" and updates the pass/fail totals.
#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while (0)
// Rounds num up to the next multiple of alignment (num itself when already aligned).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = b3OpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
b3OpenCLDeviceInfo info;
|
||||
g_device= b3OpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
b3OpenCLUtils::printDeviceInfo(g_device);
|
||||
b3OpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void fillIntTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
|
||||
int maxSize=1024*256;
|
||||
b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
|
||||
intBuffer.resize(maxSize);
|
||||
|
||||
#define NUM_TESTS 7
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for (int iter=0;iter<NUM_TESTS;iter++)
|
||||
{
|
||||
int size = b3Min( 11+dx*iter, maxSize );
|
||||
|
||||
int value = 2;
|
||||
|
||||
|
||||
int offset=0;
|
||||
fillCL->execute(intBuffer,value,size,offset);
|
||||
|
||||
b3AlignedObjectArray<int> hostBuf2;
|
||||
hostBuf2.resize(size);
|
||||
fillCL->executeHost(hostBuf2,value,size,offset);
|
||||
|
||||
b3AlignedObjectArray<int> hostBuf;
|
||||
intBuffer.copyToHost(hostBuf);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
delete fillCL;
|
||||
|
||||
TEST_REPORT( "fillIntTest" );
|
||||
}
|
||||
|
||||
|
||||
/// Seeds the C library random number generator used by getRandom.
__inline
void seedRandom(int seed)
{
	srand( static_cast<unsigned int>(seed) );
}
|
||||
|
||||
/**
 * Returns a pseudo-random value between minV and maxV.
 *
 * A fraction in [0, 0.9999] is drawn from rand() in steps of 1/10000,
 * scaled by (maxV - minV), added to minV and converted to T.
 */
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	const float fraction = (rand()%10000)/10000.f;
	const T span = maxV - minV;
	return static_cast<T>(minV + fraction*span);
}
|
||||
|
||||
struct b3SortDataCompare
|
||||
{
|
||||
inline bool operator()(const b3SortData& first, const b3SortData& second) const
|
||||
{
|
||||
return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void boundSearchTest( )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
int bucketSize = 256;
|
||||
|
||||
b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
|
||||
|
||||
b3AlignedObjectArray<b3SortData> srcHost;
|
||||
b3AlignedObjectArray<unsigned int> upperHost;
|
||||
b3AlignedObjectArray<unsigned int> lowerHost;
|
||||
b3AlignedObjectArray<unsigned int> upperHostCompare;
|
||||
b3AlignedObjectArray<unsigned int> lowerHostCompare;
|
||||
|
||||
b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
|
||||
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
|
||||
int size = b3Min( 128+dx*iter, maxSize );
|
||||
|
||||
upperHost.resize(bucketSize);
|
||||
lowerHost.resize(bucketSize);
|
||||
upperHostCompare.resize(bucketSize);
|
||||
lowerHostCompare.resize(bucketSize);
|
||||
|
||||
srcHost.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
b3SortData v;
|
||||
// v.m_key = i<2? 0 : 5;
|
||||
v.m_key = getRandom(0,bucketSize);
|
||||
|
||||
v.m_value = i;
|
||||
srcHost.at(i) = v;
|
||||
}
|
||||
|
||||
srcHost.quickSort(b3SortDataCompare());
|
||||
srcCL.copyFromHost(srcHost);
|
||||
|
||||
{
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
lowerHost[i] = -1;
|
||||
lowerHostCompare[i] = -1;
|
||||
upperHost[i] = -1;
|
||||
upperHostCompare[i] = -1;
|
||||
}
|
||||
upperCL.copyFromHost(upperHost);
|
||||
lowerCL.copyFromHost(lowerHost);
|
||||
}
|
||||
|
||||
search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
|
||||
search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
|
||||
|
||||
search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
|
||||
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
|
||||
|
||||
lowerCL.copyToHost(lowerHost);
|
||||
upperCL.copyToHost(upperHost);
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
|
||||
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
|
||||
}
|
||||
/*
|
||||
for(int i=1; i<bucketSize; i++)
|
||||
{
|
||||
int lhi_1 = lowerHost[i-1];
|
||||
int lhi = lowerHost[i];
|
||||
|
||||
for(int j=lhi_1; j<lhi; j++)
|
||||
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key < i );
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int jMin = (i==0)?0:upperHost[i-1];
|
||||
for(int j=jMin; j<upperHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key <= i );
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int lhi = lowerHost[i];
|
||||
int uhi = upperHost[i];
|
||||
|
||||
for(int j=lhi; j<uhi; j++)
|
||||
{
|
||||
if ( srcHost[j].m_key != i )
|
||||
{
|
||||
printf("error %d != %d\n",srcHost[j].m_key,i);
|
||||
}
|
||||
TEST_ASSERT( srcHost[j].m_key == i );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
delete search;
|
||||
|
||||
TEST_REPORT( "boundSearchTest" );
|
||||
}
|
||||
|
||||
|
||||
void prefixScanTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
b3AlignedObjectArray<unsigned int> buf0Host;
|
||||
b3AlignedObjectArray<unsigned int> buf1Host;
|
||||
|
||||
b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
|
||||
|
||||
|
||||
b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = b3Min( 128+dx*iter, maxSize );
|
||||
buf0Host.resize(size);
|
||||
buf1Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
buf0Host[i] = 1;
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
unsigned int sumHost, sumGPU;
|
||||
|
||||
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
|
||||
scan->execute( buf2CL, buf3CL, size, &sumGPU );
|
||||
|
||||
buf3CL.copyToHost(buf0Host);
|
||||
|
||||
TEST_ASSERT( sumHost == sumGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
|
||||
}
|
||||
|
||||
delete scan;
|
||||
|
||||
TEST_REPORT( "scanTest" );
|
||||
}
|
||||
|
||||
|
||||
bool radixSortTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
b3AlignedObjectArray<b3SortData> buf0Host;
|
||||
buf0Host.resize(maxSize);
|
||||
b3AlignedObjectArray<b3SortData> buf1Host;
|
||||
buf1Host.resize(maxSize );
|
||||
b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
|
||||
|
||||
b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = b3Min( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );//not necessary
|
||||
|
||||
buf0Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
b3SortData v;
|
||||
v.m_key = getRandom(0,0xff);
|
||||
v.m_value = i;
|
||||
buf0Host[i] = v;
|
||||
}
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
|
||||
sort->executeHost( buf0Host);
|
||||
sort->execute(buf2CL);
|
||||
|
||||
buf2CL.copyToHost(buf1Host);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
|
||||
}
|
||||
}
|
||||
|
||||
delete sort;
|
||||
|
||||
TEST_REPORT( "radixSort" );
|
||||
|
||||
return g_testFailed;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1;
|
||||
int preferredPlatformIndex = -1;
|
||||
|
||||
b3CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
fillIntTest();
|
||||
|
||||
boundSearchTest();
|
||||
|
||||
prefixScanTest();
|
||||
|
||||
radixSortTest();
|
||||
|
||||
exitCL();
|
||||
|
||||
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
|
||||
printf("End, press <enter>\n");
|
||||
getchar();
|
||||
}
|
||||
|
@ -1,41 +0,0 @@
|
||||
-- Defines the "OpenCL_primitives_test_<vendor>" console project when
-- findOpenCL reports an OpenCL SDK for the given vendor; otherwise does nothing.
function createProject(vendor)
	-- local: keep the probe result out of the global environment
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_primitives_test_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".","..","../../../src"}

		files {
			"main.cpp",
			"../../basic_initialize/b3OpenCLInclude.h",
			"../../basic_initialize/b3OpenCLUtils.cpp",
			"../../basic_initialize/b3OpenCLUtils.h",
			"../host/b3FillCL.cpp",
			"../host/b3FillCL.h",
			"../host/b3BoundSearchCL.cpp",
			"../host/b3BoundSearchCL.h",
			"../host/b3PrefixScanCL.cpp",
			"../host/b3PrefixScanCL.h",
			"../host/b3RadixSort32CL.cpp",
			"../host/b3RadixSort32CL.h",
			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
			"../../../src/Bullet3Common/b3AlignedAllocator.h",
			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
		}

	end
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
@ -1,116 +0,0 @@
|
||||
///original author: Erwin Coumans
|
||||
#include "b3OpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "../parallel_primitives/host/b3LauncherCL.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
// Turns its argument into a string literal; used to embed the kernel source below.
#define MSTRINGIFY(A) #A

// OpenCL source of ReduceGlobal: every work-group sums its slice of d_in in
// place by repeated halving and work-item 0 writes the group total to
// d_out[group id]. Partner elements at or beyond numElements are skipped,
// so a partially filled last work-group no longer reads past the input.
const char* kernelString= MSTRINGIFY(
__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
{
	int myId = get_global_id(0);
	int tid = get_local_id(0);

	int ls = get_local_size(0);
	for (unsigned int s=ls/2;s>0;s>>=1)
	{
		if (myId<numElements)
		{
			if (tid<s && myId+s<numElements)
			{
				d_in[myId] += d_in[myId+s];
			}
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	if (tid==0)
	{
		if (myId<numElements)
		{
			d_out[get_group_id(0)]=d_in[myId];
		}
	}
}
);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
cl_platform_id platformId;
|
||||
cl_context ctx;
|
||||
cl_command_queue queue;
|
||||
cl_device_id device;
|
||||
cl_kernel addKernel;
|
||||
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
b3OpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
if (!ctx) {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
device = b3OpenCLUtils::getDevice(ctx,0);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numElements = 1024*1024;
|
||||
b3OpenCLArray<int> a(ctx,queue);
|
||||
b3OpenCLArray<int> b(ctx,queue);
|
||||
b3AlignedObjectArray<int> hostA;
|
||||
b3AlignedObjectArray<int> hostB;
|
||||
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
hostA.push_back(1);
|
||||
hostB.push_back(0.f);
|
||||
}
|
||||
a.copyFromHost(hostA);
|
||||
b.copyFromHost(hostB);
|
||||
|
||||
int hostSum= 0;
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
hostSum += hostA.at(i);
|
||||
}
|
||||
b.resize(numElements);
|
||||
|
||||
{
|
||||
b3LauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setConst( numElements );
|
||||
launcher.launch1D( numElements,1024);
|
||||
}
|
||||
clFinish(queue);
|
||||
{
|
||||
b3LauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setConst( 1024 );
|
||||
launcher.launch1D( 1024,1024);
|
||||
}
|
||||
clFinish(queue);
|
||||
|
||||
printf("hostSum = %d\n", hostSum);
|
||||
|
||||
int clSum = a.at(0);
|
||||
printf("clSum = %d\n", clSum );
|
||||
if (hostSum != clSum)
|
||||
{
|
||||
printf("Incorrect result\n");
|
||||
} else
|
||||
{
|
||||
printf("Correct result\n");
|
||||
}
|
||||
|
||||
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
printf("press key\n");
|
||||
getchar();
|
||||
return 0;
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
|
||||
-- Defines the "OpenCL_reduce_<vendor>" console project when findOpenCL
-- reports an OpenCL SDK for the given vendor; otherwise does nothing.
function createProject (vendor)

	if not findOpenCL(vendor) then
		return
	end

	project ( "OpenCL_reduce_" .. vendor)

	initOpenCL(vendor)

	language "C++"

	kind "ConsoleApp"
	targetdir "../../bin"

	-- host-side parallel primitives library built for the same vendor
	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize",
		"../../src"
	}

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
	}

end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("NVIDIA")
|
||||
createProject("Intel")
|
||||
createProject("Apple")
|
@ -1,16 +0,0 @@
|
||||
|
||||
|
||||
// Element-wise addition of two float8 arrays: c[i] = a[i] + b[i] for every
// work-item whose global index i is below numElements.
__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)
{
	// index of this work-item's float8 element in the global arrays
	const int index = get_global_id(0);

	// work-items beyond the end of the arrays have nothing to do
	if (index < numElements)
	{
		const float8 lhs = a[index];
		const float8 rhs = b[index];

		// write the sum back to global memory
		c[index] = lhs + rhs;
	}
}
|
@ -1,20 +0,0 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* vectorAddCL= \
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
|
||||
"{\n"
|
||||
" // get oct-float index into global data array\n"
|
||||
" int iGID = get_global_id(0);\n"
|
||||
" if (iGID>=numElements)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" float8 aGID = a[iGID];\n"
|
||||
" float8 bGID = b[iGID];\n"
|
||||
"\n"
|
||||
" float8 result = aGID + bGID;\n"
|
||||
" // write back out to GMEM\n"
|
||||
" c[iGID] = result;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
@ -1,408 +0,0 @@
|
||||
|
||||
///VectorAdd sample, from the NVidia JumpStart Guide
|
||||
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
|
||||
|
||||
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
|
||||
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
|
||||
|
||||
|
||||
#define LOAD_FROM_FILE
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/OpenCL.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif //__APPLE__
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); b3Assert((a) == (b)); }
|
||||
size_t wgSize;
|
||||
|
||||
#include "VectorAddKernels.h"
|
||||
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
const char* preferredPlatform = "Intel(R) Corporation";
|
||||
#elif defined CL_PLATFORM_AMD
|
||||
const char* preferredPlatform = "Advanced Micro Devices, Inc.";
|
||||
#elif defined CL_PLATFORM_NVIDIA
|
||||
const char* preferredPlatform = "NVIDIA Corporation";
|
||||
#else
|
||||
const char* preferredPlatform = "Unknown";
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/**
 * Loads a source file into memory and prepends a preamble string.
 *
 * @param cFilename      path of the file to read (opened in binary mode)
 * @param cPreamble      text placed in front of the file contents; NULL is
 *                       treated as an empty preamble
 * @param szFinalLength  when not NULL, receives the combined length
 *                       (preamble + file) excluding the terminating NUL;
 *                       left untouched on failure
 * @return a malloc'ed, NUL-terminated buffer that the caller must free(),
 *         or NULL when the file cannot be opened, sized, read completely
 *         or when memory runs out
 */
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	char* cSourceString = NULL;
	long lSourceLength = -1;
	size_t szPreambleLength = 0;

	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	if (cPreamble != NULL)
	{
		szPreambleLength = strlen(cPreamble);
	}

	// get the length of the source code; ftell reports failure as -1
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		lSourceLength = ftell(pFileStream);
	}

	if (lSourceLength >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
	{
		const size_t szSourceLength = (size_t)lSourceLength;

		// allocate a buffer for preamble + source + terminator and read the file in
		cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
		if (cSourceString != NULL)
		{
			if (szPreambleLength > 0)
			{
				memcpy(cSourceString, cPreamble, szPreambleLength);
			}

			if (fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) == szSourceLength)
			{
				cSourceString[szSourceLength + szPreambleLength] = '\0';
				if (szFinalLength != NULL)
				{
					*szFinalLength = szSourceLength + szPreambleLength;
				}
			}
			else
			{
				// short read: do not hand back a partially filled buffer
				free(cSourceString);
				cSourceString = NULL;
			}
		}
	}

	// the file is closed on every path, success or failure
	fclose(pFileStream);

	return cSourceString;
}
|
||||
|
||||
size_t workitem_size[3];
|
||||
|
||||
/**
 * Prints the name, type flags, number of compute units and maximum
 * work-item sizes of an OpenCL device to stdout.
 *
 * Side effect: stores the queried work-item sizes in the file-scope
 * workitem_size array. Query results are pre-initialised so that a failing
 * clGetDeviceInfo call prints empty/zero values instead of garbage.
 */
void printDevInfo(cl_device_id device)
{
	char device_string[1024] = "";

	clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
	printf( " Device %s:\n", device_string);

	// CL_DEVICE_TYPE is a bit field, so more than one line may be printed
	cl_device_type type = 0;
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
	if( type & CL_DEVICE_TYPE_CPU )
		printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if( type & CL_DEVICE_TYPE_GPU )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if( type & CL_DEVICE_TYPE_ACCELERATOR )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if( type & CL_DEVICE_TYPE_DEFAULT )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	// CL_DEVICE_MAX_COMPUTE_UNITS (cl_uint, hence %u)
	cl_uint compute_units = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);

	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	// The values are size_t; cast to unsigned long so that the argument
	// type matches the conversion on both 32- and 64-bit builds.
	workitem_size[0] = workitem_size[1] = workitem_size[2] = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
	printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%lu / %lu / %lu \n",
		(unsigned long)workitem_size[0],
		(unsigned long)workitem_size[1],
		(unsigned long)workitem_size[2]);
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQue; // OpenCL command que
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel; // OpenCL kernel
|
||||
cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
|
||||
size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
|
||||
size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
|
||||
size_t szParmDataBytes; // Byte size of context information
|
||||
cl_int ciErr1, ciErr2; // Error code var
|
||||
|
||||
|
||||
int iTestN = 100000 * 8; // Size of Vectors to process
|
||||
|
||||
int actualGlobalSize = iTestN / 8;
|
||||
|
||||
|
||||
// set Global and Local work size dimensions
|
||||
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
|
||||
szLocalWorkSize[0]= iTestN>>3;
|
||||
|
||||
|
||||
// Allocate and initialize host arrays
|
||||
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
dst = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
|
||||
int i;
|
||||
|
||||
// Initialize arrays with some values
|
||||
for (i=0;i<iTestN;i++)
|
||||
{
|
||||
((cl_float*)srcA)[i] = cl_float(i);
|
||||
((cl_float*)srcB)[i] = 2;
|
||||
((cl_float*)dst)[i]=-1;
|
||||
}
|
||||
|
||||
|
||||
cl_uint numPlatforms;
|
||||
cl_platform_id platform = NULL;
|
||||
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
|
||||
|
||||
if (0 < numPlatforms)
|
||||
{
|
||||
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
|
||||
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
|
||||
|
||||
for (unsigned i = 0; i < numPlatforms; ++i)
|
||||
{
|
||||
char pbuf[100];
|
||||
status = clGetPlatformInfo(platforms[i],
|
||||
CL_PLATFORM_VENDOR,
|
||||
sizeof(pbuf),
|
||||
pbuf,
|
||||
NULL);
|
||||
|
||||
platform = platforms[i];
|
||||
if (!strcmp(pbuf, preferredPlatform))
|
||||
{
|
||||
printf("Found platform %s\n", preferredPlatform);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete[] platforms;
|
||||
}
|
||||
|
||||
cl_context_properties cps[3] =
|
||||
{
|
||||
CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)platform,
|
||||
0
|
||||
};
|
||||
|
||||
// Create OpenCL context & context
|
||||
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
|
||||
|
||||
// Query all devices available to the context
|
||||
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*)malloc(szParmDataBytes);
|
||||
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
if (cdDevices)
|
||||
{
|
||||
printDevInfo(cdDevices[0]);
|
||||
}
|
||||
|
||||
// Create a command queue for first device the context reported
|
||||
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on the device GMEM
|
||||
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
|
||||
///create kernels from binary
|
||||
int numDevices = 1;
|
||||
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
|
||||
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
|
||||
|
||||
for (i = 0; i < numDevices; ++i) {
|
||||
images[i] = 0;
|
||||
lengths[i] = 0;
|
||||
}
|
||||
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
|
||||
|
||||
|
||||
const char* cPathAndName = cSourceFile;
|
||||
#ifdef LOAD_FROM_FILE
|
||||
size_t szKernelLength;
|
||||
|
||||
const char* cSourceCL =0;
|
||||
char relativeFileName[1024];
|
||||
|
||||
{
|
||||
const char* prefix[]={"../","../../","../../../","../../../../"};
|
||||
int numPrefixes = sizeof(prefix)/sizeof(char*);
|
||||
|
||||
for (int i=0;!cSourceCL && i<numPrefixes;i++)
|
||||
{
|
||||
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
|
||||
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
|
||||
if (cSourceCL)
|
||||
{
|
||||
printf("Loaded program source: %s\n", relativeFileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!cSourceCL)
|
||||
{
|
||||
printf("Couldn't find file %s, exiting\n",cSourceFile);
|
||||
exit(0);
|
||||
}
|
||||
#else
|
||||
const char* cSourceCL = vectorAddCL;
|
||||
size_t szKernelLength = strlen(cSourceCL);
|
||||
#endif //LOAD_FROM_FILE
|
||||
|
||||
|
||||
|
||||
// Create the program
|
||||
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
|
||||
printf("clCreateProgramWithSource...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Build the program with 'mad' Optimization option
|
||||
#ifdef MAC
|
||||
char* flags = "-cl-mad-enable -DMAC ";
|
||||
#else
|
||||
char flags[1024]={0};
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
|
||||
#endif//CL_PLATFORM_INTEL
|
||||
|
||||
#endif//MAC
|
||||
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
|
||||
printf("clBuildProgram...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
|
||||
printf("clCreateKernel (VectorAdd)...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
cl_int ciErrNum;
|
||||
|
||||
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
printf("cannot get workgroup size\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Set the Argument values
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
|
||||
|
||||
printf("Press ENTER to quit\n");
|
||||
getchar();
|
||||
|
||||
int workgroupSize = wgSize;
|
||||
if(workgroupSize <= 0)
|
||||
{ // let OpenCL library calculate workgroup size
|
||||
size_t globalWorkSize[2];
|
||||
globalWorkSize[0] = actualGlobalSize;
|
||||
globalWorkSize[1] = 1;
|
||||
|
||||
// Copy input data from host to GPU and launch kernel
|
||||
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t localWorkSize[2], globalWorkSize[2];
|
||||
//workgroupSize = b3Min(workgroupSize, actualGlobalSize);
|
||||
int num_t = actualGlobalSize / workgroupSize;
|
||||
int num_g = num_t * workgroupSize;
|
||||
if(num_g < actualGlobalSize)
|
||||
{
|
||||
num_t++;
|
||||
//this can cause problems -> processing outside of the buffer
|
||||
//make sure to check kernel
|
||||
}
|
||||
|
||||
size_t globalThreads[] = {num_t * workgroupSize};
|
||||
size_t localThreads[] = {workgroupSize};
|
||||
|
||||
|
||||
localWorkSize[0] = workgroupSize;
|
||||
globalWorkSize[0] = num_t * workgroupSize;
|
||||
localWorkSize[1] = 1;
|
||||
globalWorkSize[1] = 1;
|
||||
|
||||
// Copy input data from host to GPU and launch kernel
|
||||
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
|
||||
|
||||
}
|
||||
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
printf("cannot clEnqueueNDRangeKernel\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
clFinish(cqCommandQue);
|
||||
// Read back results and check accumulated errors
|
||||
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
|
||||
|
||||
// Release kernel, program, and memory objects
|
||||
// NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
|
||||
free(cdDevices);
|
||||
clReleaseKernel(ckKernel);
|
||||
clReleaseProgram(cpProgram);
|
||||
clReleaseCommandQueue(cqCommandQue);
|
||||
clReleaseContext(cxGPUContext);
|
||||
|
||||
|
||||
// print the results
|
||||
int iErrorCount = 0;
|
||||
for (i = 0; i < iTestN; i++)
|
||||
{
|
||||
if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
|
||||
iErrorCount++;
|
||||
}
|
||||
|
||||
if (iErrorCount)
|
||||
{
|
||||
printf("Validation FAILED\n");
|
||||
} else
|
||||
{
|
||||
printf("Validation SUCCESSFULL\n");
|
||||
}
|
||||
// Free host memory, close log and return success
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
clReleaseMemObject(cmMemObjs[i]);
|
||||
}
|
||||
|
||||
free(srcA);
|
||||
free(srcB);
|
||||
free (dst);
|
||||
printf("Press ENTER to quit\n");
|
||||
getchar();
|
||||
}
|
||||
|
||||
|
@ -1,69 +0,0 @@
|
||||
///original author: Erwin Coumans
|
||||
#include "b3OpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "../parallel_primitives/host/b3LauncherCL.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
// Turns its argument into a string literal, so the OpenCL kernel below can be
// written as plain code instead of a quoted string.
#define MSTRINGIFY(A) #A

// OpenCL C source of the VectorAdd kernel: each work item whose global id is
// below numElements writes c[id] = a[id] + b[id]; other work items return early.
const char* kernelString = MSTRINGIFY(
	__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
	{
		int iGID = get_global_id(0);
		if (iGID>=numElements)
			return;
		float aGID = a[iGID];
		float bGID = b[iGID];
		float result = aGID + bGID;
		c[iGID] = result;
	}
);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
cl_platform_id platformId;
|
||||
cl_context ctx;
|
||||
cl_command_queue queue;
|
||||
cl_device_id device;
|
||||
cl_kernel addKernel;
|
||||
ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
b3OpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
if (!ctx) {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
device = b3OpenCLUtils::getDevice(ctx,0);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numElements = 32;
|
||||
b3OpenCLArray<float> a(ctx,queue);
|
||||
b3OpenCLArray<float> b(ctx,queue);
|
||||
b3OpenCLArray<float> c(ctx,queue);
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
a.push_back(float(i));
|
||||
b.push_back(float(i));
|
||||
}
|
||||
|
||||
c.resize(numElements);
|
||||
b3LauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setBuffer( c.getBufferCL());
|
||||
launcher.setConst( numElements );
|
||||
launcher.launch1D( numElements);
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
float v = c.at(i);
|
||||
printf("c[%d]=%f\n",i,v);
|
||||
}
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
return 0;
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
|
||||
-- Declares the "OpenCL_vector_add_simplified_<vendor>" console project.
-- Nothing is declared when findOpenCL reports no OpenCL for that vendor.
function createProject (vendor)

	if not findOpenCL(vendor) then
		return
	end

	project ("OpenCL_vector_add_simplified_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	-- host-side parallel primitives library built for the same vendor
	links { "OpenCL_lib_parallel_primitives_host_" .. vendor }

	includedirs { "../basic_initialize", "../../src" }

	files {
		"main.cpp",
		"../basic_initialize/b3OpenCLUtils.cpp",
		"../basic_initialize/b3OpenCLUtils.h",
		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
		"../../src/Bullet3Common/b3AlignedAllocator.h",
		"../../src/Bullet3Common/b3AlignedObjectArray.h",
	}

end
|
||||
|
||||
-- Declare one project per OpenCL vendor, in the same order as before.
for _, vendor in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendor)
end
|
@ -1,15 +1,15 @@
|
||||
|
||||
#include "b3GpuSapBroadphase.h"
|
||||
#include "Bullet3Common/b3Vector3.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
|
||||
|
||||
#include "../kernels/sapKernels.h"
|
||||
#include "../kernels/sapFastKernels.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "kernels/sapKernels.h"
|
||||
#include "kernels/sapFastKernels.h"
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
|
||||
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
|
||||
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
|
||||
|
||||
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
|
||||
:m_context(ctx),
|
||||
@ -28,9 +28,9 @@ m_currentBuffer(-1)
|
||||
|
||||
cl_int errNum=0;
|
||||
|
||||
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
|
||||
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
|
||||
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
#ifndef B3_GPU_SAP_BROADPHASE_H
|
||||
#define B3_GPU_SAP_BROADPHASE_H
|
||||
|
||||
#include "parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "parallel_primitives/host/b3FillCL.h" //b3Int2
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
|
||||
class b3Vector3;
|
||||
#include "parallel_primitives/host/b3RadixSort32CL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
|
||||
#include "b3SapAabb.h"
|
||||
|
@ -29,18 +29,23 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
|
||||
#include <float.h> //for FLT_MAX
|
||||
#include "basic_initialize/b3OpenCLUtils.h"
|
||||
#include "parallel_primitives/host/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
//#include "AdlQuaternion.h"
|
||||
|
||||
#include "../kernels/satKernels.h"
|
||||
#include "../kernels/satClipHullContacts.h"
|
||||
#include "../kernels/bvhTraversal.h"
|
||||
#include "../kernels/primitiveContacts.h"
|
||||
#include "kernels/satKernels.h"
|
||||
#include "kernels/satClipHullContacts.h"
|
||||
#include "kernels/bvhTraversal.h"
|
||||
#include "kernels/primitiveContacts.h"
|
||||
|
||||
|
||||
#include "Bullet3Geometry/b3AabbUtil.h"
|
||||
|
||||
#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl"
|
||||
#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl"
|
||||
#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl"
|
||||
#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl"
|
||||
|
||||
|
||||
#define dot3F4 b3Dot
|
||||
|
||||
@ -64,7 +69,7 @@ m_totalContactsOut(m_context, m_queue)
|
||||
// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
|
||||
//#endif
|
||||
|
||||
cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,"opencl/gpu_narrowphase/kernels/sat.cl");
|
||||
cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,BT_NARROWPHASE_SAT_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "findSeparatingAxisKernel",&errNum,satProg );
|
||||
@ -92,7 +97,7 @@ m_totalContactsOut(m_context, m_queue)
|
||||
// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
|
||||
//#endif
|
||||
|
||||
cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,"opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
|
||||
cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
|
||||
@ -132,7 +137,7 @@ m_totalContactsOut(m_context, m_queue)
|
||||
if (1)
|
||||
{
|
||||
const char* srcBvh = bvhTraversalKernelCL;
|
||||
cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"","opencl/gpu_narrowphase/kernels/bvhTraversal.cl");
|
||||
cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,"");
|
||||
@ -142,7 +147,7 @@ m_totalContactsOut(m_context, m_queue)
|
||||
|
||||
{
|
||||
const char* primitiveContactsSrc = primitiveContactsKernelsCL;
|
||||
cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"","opencl/gpu_narrowphase/kernels/primitiveContacts.cl");
|
||||
cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,"");
|
||||
@ -527,7 +532,7 @@ void computeContactPlaneConvex(int pairIndex,
|
||||
b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
|
||||
c->m_worldPos[i] = pOnB1;
|
||||
}
|
||||
c->m_worldNormal[3] = numReducedPoints;
|
||||
c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
|
||||
}//if (dstIdx < numPairs)
|
||||
}
|
||||
|
||||
@ -665,7 +670,7 @@ void computeContactPlaneCompound(int pairIndex,
|
||||
b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
|
||||
c->m_worldPos[i] = pOnB1;
|
||||
}
|
||||
c->m_worldNormal[3] = numReducedPoints;
|
||||
c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
|
||||
}//if (dstIdx < numPairs)
|
||||
}
|
||||
|
||||
@ -825,7 +830,7 @@ void computeContactSphereConvex(int pairIndex,
|
||||
c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
|
||||
c->m_worldPos[0] = pOnB1;
|
||||
int numPoints = 1;
|
||||
c->m_worldNormal[3] = numPoints;
|
||||
c->m_worldNormal[3] = (b3Scalar)numPoints;
|
||||
}//if (dstIdx < numPairs)
|
||||
}
|
||||
}//if (hasCollision)
|
@ -2,15 +2,15 @@
|
||||
#ifndef _CONVEX_HULL_CONTACT_H
|
||||
#define _CONVEX_HULL_CONTACT_H
|
||||
|
||||
#include "parallel_primitives/host/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
|
||||
#include "Bullet3Common/b3AlignedObjectArray.h"
|
||||
#include "b3ConvexUtility.h"
|
||||
#include "b3ConvexPolyhedronCL.h"
|
||||
#include "b3Collidable.h"
|
||||
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
|
||||
#include "parallel_primitives/host/b3Int2.h"
|
||||
#include "parallel_primitives/host/b3Int4.h"
|
||||
#include "Bullet3Common/b3Int2.h"
|
||||
#include "Bullet3Common/b3Int4.h"
|
||||
#include "b3OptimizedBvh.h"
|
||||
#include "b3BvhInfo.h"
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user