bullet3/examples/MultiThreadedDemo/ParallelFor.h

/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, 
including commercial applications, and to alter it and redistribute it freely, 
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

#include <stdio.h> //printf debugging
#include <algorithm>


// choose threading providers:
#if BT_USE_TBB
#define USE_TBB 1     // use Intel Threading Building Blocks for thread management
#endif

#if BT_USE_PPL
#define USE_PPL 1     // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
#endif // BT_USE_PPL

#if BT_USE_OPENMP
#define USE_OPENMP 1  // use OpenMP (also need to change compiler options for OpenMP support)
#endif


#if USE_OPENMP

#include <omp.h>

#endif // #if USE_OPENMP


#if USE_PPL

#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
// Visual Studio 2010 and later should come with it
#include <concrtrm.h>  // for GetProcessorCount()
#endif // #if USE_PPL


#if USE_TBB

#define __TBB_NO_IMPLICIT_LINKAGE 1
#include <tbb/tbb.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

#endif // #if USE_TBB


///
/// TaskManager -- remembers which threading API has been selected and how many
///                threads that API has been asked to use.
///
/// Only APIs enabled at compile time (USE_OPENMP / USE_TBB / USE_PPL) can be
/// selected; apiNone is always available.
///
class TaskManager
{
public:
    /// Threading back ends that can be requested through setApi().
    enum Api
    {
        apiNone,
        apiOpenMP,
        apiTbb,
        apiPpl,
        apiCount
    };

    /// Returns a printable name for the given API, or "unknown" for any other value.
    static const char* getApiName( Api api )
    {
        if ( api == apiNone )
        {
            return "None";
        }
        if ( api == apiOpenMP )
        {
            return "OpenMP";
        }
        if ( api == apiTbb )
        {
            return "Intel TBB";
        }
        if ( api == apiPpl )
        {
            return "MS PPL";
        }
        return "unknown";
    }

    /// Starts with no API selected and a thread count of 0, which init() treats
    /// as "not configured yet".
    TaskManager()
        : m_api( apiNone )
        , m_numThreads( 0 )
#if USE_TBB
        , m_tbbSchedulerInit( NULL )
#endif // #if USE_TBB
    {
    }

    /// Returns the currently selected API.
    Api getApi() const
    {
        return m_api;
    }

    /// Returns true if the given API was enabled at compile time.
    /// apiNone is always "supported".
    bool isSupported( Api api ) const
    {
        switch ( api )
        {
        case apiNone:
            return true;
#if USE_OPENMP
        case apiOpenMP:
            return true;
#endif
#if USE_TBB
        case apiTbb:
            return true;
#endif
#if USE_PPL
        case apiPpl:
            return true;
#endif
        default:
            return false;
        }
    }

    /// Selects the given API; if there is no compile time support for it,
    /// falls back to apiNone.
    void setApi( Api api )
    {
        m_api = isSupported( api ) ? api : apiNone;
    }

    /// Returns the thread count reported by the first compiled-in API, checked in
    /// the order OpenMP, PPL, TBB, or 1 when none is compiled in.
    /// The result does not depend on the API chosen with setApi().
    static int getMaxNumThreads()
    {
#if USE_OPENMP
        return omp_get_max_threads();
#elif USE_PPL
        return concurrency::GetProcessorCount();
#elif USE_TBB
        return tbb::task_scheduler_init::default_num_threads();
#else
        return 1;
#endif
    }

    /// Returns the thread count last stored by setNumThreads() (0 before that).
    int getNumThreads() const
    {
        return m_numThreads;
    }

    /// Stores the requested thread count (values below 1 become 1), applies it to
    /// every compiled-in API, and returns the count actually stored.
    int setNumThreads( int numThreads )
    {
        m_numThreads = ( numThreads < 1 ) ? 1 : numThreads;

#if USE_OPENMP
        omp_set_num_threads( m_numThreads );
#endif

#if USE_PPL
        {
            // drop any scheduler already attached to this context before creating
            // one whose concurrency limits are pinned to the requested count
            if ( concurrency::CurrentScheduler::Id() != -1 )
            {
                concurrency::CurrentScheduler::Detach();
            }
            concurrency::SchedulerPolicy pplPolicy;
            pplPolicy.SetConcurrencyLimits( m_numThreads, m_numThreads );
            concurrency::CurrentScheduler::Create( pplPolicy );
        }
#endif

#if USE_TBB
        // replace the previous scheduler object (deleting NULL is a no-op)
        delete m_tbbSchedulerInit;
        m_tbbSchedulerInit = NULL;
        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
#endif
        return m_numThreads;
    }

    /// Applies the thread count. When none has been set yet, first selects an API
    /// from those compiled in and then uses getMaxNumThreads().
    void init()
    {
        if ( m_numThreads != 0 )
        {
            // already configured: just re-apply the stored thread count
            setNumThreads( m_numThreads );
            return;
        }

        // each setApi call overrides the previous one, so when several APIs are
        // compiled in the last one listed here (OpenMP) ends up selected
#if USE_PPL
        setApi( apiPpl );
#endif
#if USE_TBB
        setApi( apiTbb );
#endif
#if USE_OPENMP
        setApi( apiOpenMP );
#endif
        setNumThreads( getMaxNumThreads() );
    }

    /// Destroys the TBB scheduler object, if one was created; does nothing otherwise.
    void shutdown()
    {
#if USE_TBB
        // deleting NULL is a no-op, so no separate null check is needed
        delete m_tbbSchedulerInit;
        m_tbbSchedulerInit = NULL;
#endif
    }

private:
    Api m_api;         // API selected through setApi()
    int m_numThreads;  // 0 until setNumThreads() has been called
#if USE_TBB
    tbb::task_scheduler_init* m_tbbSchedulerInit;  // created by setNumThreads(), destroyed by shutdown()
#endif // #if USE_TBB
};

extern TaskManager gTaskMgr;


/// Convenience wrapper: runs init() on the global task manager gTaskMgr.
static inline void initTaskScheduler()
{
    gTaskMgr.init();
}

/// Convenience wrapper: runs shutdown() on the global task manager gTaskMgr.
static inline void cleanupTaskScheduler()
{
    gTaskMgr.shutdown();
}


#if USE_TBB
///
/// TbbBodyAdapter -- Wraps a body object that implements the
///                   "forLoop(int iBegin, int iEnd) const" function
///  so that it can be called with a tbb::blocked_range<int> argument.
///
template <class TBody>
struct TbbBodyAdapter
{
    const TBody* mBody;  // wrapped body; the adapter never deletes it

    /// Forwards the bounds of the given range to the wrapped body's forLoop().
    void operator()( const tbb::blocked_range<int>& range ) const
    {
        const int rangeBegin = range.begin();
        const int rangeEnd = range.end();
        mBody->forLoop( rangeBegin, rangeEnd );
    }
};
#endif // #if USE_TBB

#if USE_PPL
///
/// PplBodyAdapter -- Wraps a body object that implements the
///                   "forLoop(int iBegin, int iEnd) const" function
///  so that it can be called through "void operator()( int ) const".
///
template <class TBody>
struct PplBodyAdapter
{
    const TBody* mBody;  // wrapped body; the adapter never deletes it
    int mGrainSize;      // number of indices handed to forLoop per call
    int mIndexEnd;       // upper bound that no chunk may extend past

    /// Runs the wrapped body on the chunk starting at i: mGrainSize indices long,
    /// cut short so that it never extends past mIndexEnd.
    void operator()( int i ) const
    {
        const int chunkEnd = i + mGrainSize;
        mBody->forLoop( i, ( mIndexEnd < chunkEnd ) ? mIndexEnd : chunkEnd );
    }
};
#endif // #if USE_PPL


///
/// parallelFor -- interface for submitting work expressed as a for loop to the worker threads
///
/// When a threading API is selected in gTaskMgr, the range [iBegin, iEnd) is cut into
/// chunks of at most grainSize indices and body.forLoop( chunkBegin, chunkEnd ) is called
/// once per chunk through that API. Otherwise body.forLoop( iBegin, iEnd ) is called once
/// on the calling thread.
///
/// iBegin     first index of the range
/// iEnd       one past the last index of the range
/// grainSize  maximum number of indices per chunk; values below 1 are treated as 1
/// body       object implementing "forLoop(int iBegin, int iEnd) const"
///
template <class TBody>
void parallelFor( int iBegin, int iEnd, int grainSize, const TBody& body )
{
#if USE_OPENMP || USE_PPL || USE_TBB
    // A grain size below 1 would stop the OpenMP loop from advancing and is not a valid
    // step / grain size for PPL or TBB, so fall back to one index per chunk.
    const int chunkSize = ( grainSize > 0 ) ? grainSize : 1;
#endif

#if USE_OPENMP
    if ( gTaskMgr.getApi() == TaskManager::apiOpenMP )
    {
        // OpenMP dispatch: each loop iteration handles one chunk
#pragma omp parallel for schedule(static, 1)
        for ( int i = iBegin; i < iEnd; i += chunkSize )
        {
            body.forLoop( i, (std::min)( i + chunkSize, iEnd ) );
        }
        return;
    }
#endif // #if USE_OPENMP

#if USE_PPL
    if ( gTaskMgr.getApi() == TaskManager::apiPpl )
    {
        // PPL dispatch: parallel_for steps by chunkSize and the adapter expands each
        // start index into a [start, start + chunkSize) chunk clamped to iEnd
        PplBodyAdapter<TBody> pplBody;
        pplBody.mBody = &body;
        pplBody.mGrainSize = chunkSize;
        pplBody.mIndexEnd = iEnd;
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for( iBegin,
                                   iEnd,
                                   chunkSize,
                                   pplBody
                                   );
        return;
    }
#endif //#if USE_PPL

#if USE_TBB
    if ( gTaskMgr.getApi() == TaskManager::apiTbb )
    {
        // TBB dispatch
        TbbBodyAdapter<TBody> tbbBody;
        tbbBody.mBody = &body;
        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, chunkSize ),
                           tbbBody,
                           tbb::simple_partitioner()
                           );
        return;
    }
#endif // #if USE_TBB

    {
        // run on main thread: the whole range in a single call, grainSize is not used
        body.forLoop( iBegin, iEnd );
    }

}
MultiThreaded Demo: - fixing various race conditions throughout (usage of static vars, etc) - addition of a few lightweight mutexes (which are compiled out by default) - slight code rearrangement in discreteDynamicsWorld to facilitate multithreading - PoolAllocator::allocate() can now be called when pool is full without crashing (null pointer returned) - PoolAllocator allocate and freeMemory, are OPTIONALLY threadsafe (default is un-threadsafe) - CollisionDispatcher no longer checks if the pool allocator is full before calling allocate(), instead it just calls allocate() and checks if the return is null -- this avoids a race condition - SequentialImpulseConstraintSolver OPTIONALLY uses different logic in getOrInitSolverBody() to avoid a race condition with kinematic bodies - addition of 2 classes which together allow simulation islands to be run in parallel: - btSimulationIslandManagerMt - btDiscreteDynamicsWorldMt - MultiThreadedDemo example in the example browser demonstrating use of OpenMP, Microsoft PPL, and Intel TBB - use multithreading for other demos - benchmark demo: add parallel raycasting 2016-09-27 07:01:45 +00:00			`/*`
			`Bullet Continuous Collision Detection and Physics Library`
			`Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/`

			`This software is provided 'as-is', without any express or implied warranty.`
			`In no event will the authors be held liable for any damages arising from the use of this software.`
			`Permission is granted to anyone to use this software for any purpose,`
			`including commercial applications, and to alter it and redistribute it freely,`
			`subject to the following restrictions:`

			`1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.`
			`2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.`
			`3. This notice may not be removed or altered from any source distribution.`
			`*/`

			`#include <stdio.h> //printf debugging`
			`#include <algorithm>`


			`// choose threading providers:`
			`#if BT_USE_TBB`
			`#define USE_TBB 1 // use Intel Threading Building Blocks for thread management`
			`#endif`

			`#if BT_USE_PPL`
			`#define USE_PPL 1 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)`
			`#endif // BT_USE_PPL`

			`#if BT_USE_OPENMP`
			`#define USE_OPENMP 1 // use OpenMP (also need to change compiler options for OpenMP support)`
			`#endif`


			`#if USE_OPENMP`

			`#include <omp.h>`

			`#endif // #if USE_OPENMP`


			`#if USE_PPL`

			`#include <ppl.h> // if you get a compile error here, check whether your version of Visual Studio includes PPL`
			`// Visual Studio 2010 and later should come with it`
			`#include <concrtrm.h> // for GetProcessorCount()`
			`#endif // #if USE_PPL`


			`#if USE_TBB`

			`#define __TBB_NO_IMPLICIT_LINKAGE 1`
			`#include <tbb/tbb.h>`
			`#include <tbb/task_scheduler_init.h>`
			`#include <tbb/parallel_for.h>`
			`#include <tbb/blocked_range.h>`

			`#endif // #if USE_TBB`



			`class TaskManager`
			`{`
			`public:`
			`enum Api`
			`{`
			`apiNone,`
			`apiOpenMP,`
			`apiTbb,`
			`apiPpl,`
			`apiCount`
			`};`
			`static const char* getApiName( Api api )`
			`{`
			`switch ( api )`
			`{`
			`case apiNone: return "None";`
			`case apiOpenMP: return "OpenMP";`
			`case apiTbb: return "Intel TBB";`
			`case apiPpl: return "MS PPL";`
			`default: return "unknown";`
			`}`
			`}`

			`TaskManager()`
			`{`
			`m_api = apiNone;`
			`m_numThreads = 0;`
			`#if USE_TBB`
			`m_tbbSchedulerInit = NULL;`
			`#endif // #if USE_TBB`
			`}`

			`Api getApi() const`
			`{`
			`return m_api;`
			`}`

			`bool isSupported( Api api ) const`
			`{`
			`#if USE_OPENMP`
			`if ( api == apiOpenMP )`
			`{`
			`return true;`
			`}`
			`#endif`
			`#if USE_TBB`
			`if ( api == apiTbb )`
			`{`
			`return true;`
			`}`
			`#endif`
			`#if USE_PPL`
			`if ( api == apiPpl )`
			`{`
			`return true;`
			`}`
			`#endif`
			`// apiNone is always "supported"`
			`return api == apiNone;`
			`}`

			`void setApi( Api api )`
			`{`
			`if (isSupported(api))`
			`{`
			`m_api = api;`
			`}`
			`else`
			`{`
			`// no compile time support for selected API, fallback to "none"`
			`m_api = apiNone;`
			`}`
			`}`

			`static int getMaxNumThreads()`
			`{`
			`#if USE_OPENMP`
			`return omp_get_max_threads();`
			`#elif USE_PPL`
			`return concurrency::GetProcessorCount();`
			`#elif USE_TBB`
			`return tbb::task_scheduler_init::default_num_threads();`
			`#endif`
			`return 1;`
			`}`

			`int getNumThreads() const`
			`{`
			`return m_numThreads;`
			`}`

			`int setNumThreads( int numThreads )`
			`{`
			`m_numThreads = ( std::max )( 1, numThreads );`

			`#if USE_OPENMP`
			`omp_set_num_threads( m_numThreads );`
			`#endif`

			`#if USE_PPL`
			`{`
			`using namespace concurrency;`
			`if ( CurrentScheduler::Id() != -1 )`
			`{`
			`CurrentScheduler::Detach();`
			`}`
			`SchedulerPolicy policy;`
			`policy.SetConcurrencyLimits( m_numThreads, m_numThreads );`
			`CurrentScheduler::Create( policy );`
			`}`
			`#endif`

			`#if USE_TBB`
			`if ( m_tbbSchedulerInit )`
			`{`
			`delete m_tbbSchedulerInit;`
			`m_tbbSchedulerInit = NULL;`
			`}`
			`m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );`
			`#endif`
			`return m_numThreads;`
			`}`

			`void init()`
			`{`
			`if (m_numThreads == 0)`
			`{`
			`#if USE_PPL`
			`setApi( apiPpl );`
			`#endif`
			`#if USE_TBB`
			`setApi( apiTbb );`
			`#endif`
			`#if USE_OPENMP`
			`setApi( apiOpenMP );`
			`#endif`
			`setNumThreads(getMaxNumThreads());`
			`}`
			`else`
			`{`
			`setNumThreads(m_numThreads);`
			`}`
			`}`

			`void shutdown()`
			`{`
			`#if USE_TBB`
			`if ( m_tbbSchedulerInit )`
			`{`
			`delete m_tbbSchedulerInit;`
			`m_tbbSchedulerInit = NULL;`
			`}`
			`#endif`
			`}`

			`private:`
			`Api m_api;`
			`int m_numThreads;`
			`#if USE_TBB`
			`tbb::task_scheduler_init* m_tbbSchedulerInit;`
			`#endif // #if USE_TBB`
			`};`

			`extern TaskManager gTaskMgr;`


fix many warnings remove btMultiSapBroadphase.* make collisionFilterGroup/collisionFilterMark int (instead of short int) 2017-01-16 06:26:11 +00:00			`inline static void initTaskScheduler()`
MultiThreaded Demo: - fixing various race conditions throughout (usage of static vars, etc) - addition of a few lightweight mutexes (which are compiled out by default) - slight code rearrangement in discreteDynamicsWorld to facilitate multithreading - PoolAllocator::allocate() can now be called when pool is full without crashing (null pointer returned) - PoolAllocator allocate and freeMemory, are OPTIONALLY threadsafe (default is un-threadsafe) - CollisionDispatcher no longer checks if the pool allocator is full before calling allocate(), instead it just calls allocate() and checks if the return is null -- this avoids a race condition - SequentialImpulseConstraintSolver OPTIONALLY uses different logic in getOrInitSolverBody() to avoid a race condition with kinematic bodies - addition of 2 classes which together allow simulation islands to be run in parallel: - btSimulationIslandManagerMt - btDiscreteDynamicsWorldMt - MultiThreadedDemo example in the example browser demonstrating use of OpenMP, Microsoft PPL, and Intel TBB - use multithreading for other demos - benchmark demo: add parallel raycasting 2016-09-27 07:01:45 +00:00			`{`
			`gTaskMgr.init();`
			`}`

fix many warnings remove btMultiSapBroadphase.* make collisionFilterGroup/collisionFilterMark int (instead of short int) 2017-01-16 06:26:11 +00:00			`inline static void cleanupTaskScheduler()`
MultiThreaded Demo: - fixing various race conditions throughout (usage of static vars, etc) - addition of a few lightweight mutexes (which are compiled out by default) - slight code rearrangement in discreteDynamicsWorld to facilitate multithreading - PoolAllocator::allocate() can now be called when pool is full without crashing (null pointer returned) - PoolAllocator allocate and freeMemory, are OPTIONALLY threadsafe (default is un-threadsafe) - CollisionDispatcher no longer checks if the pool allocator is full before calling allocate(), instead it just calls allocate() and checks if the return is null -- this avoids a race condition - SequentialImpulseConstraintSolver OPTIONALLY uses different logic in getOrInitSolverBody() to avoid a race condition with kinematic bodies - addition of 2 classes which together allow simulation islands to be run in parallel: - btSimulationIslandManagerMt - btDiscreteDynamicsWorldMt - MultiThreadedDemo example in the example browser demonstrating use of OpenMP, Microsoft PPL, and Intel TBB - use multithreading for other demos - benchmark demo: add parallel raycasting 2016-09-27 07:01:45 +00:00			`{`
			`gTaskMgr.shutdown();`
			`}`


			`#if USE_TBB`
			`///`
			`/// TbbBodyAdapter -- Converts a body object that implements the`
			`/// "forLoop(int iBegin, int iEnd) const" function`
			`/// into a TBB compatible object that takes a tbb::blocked_range<int> type.`
			`///`
			`template <class TBody>`
			`struct TbbBodyAdapter`
			`{`
			`const TBody* mBody;`

			`void operator()( const tbb::blocked_range<int>& range ) const`
			`{`
			`mBody->forLoop( range.begin(), range.end() );`
			`}`
			`};`
			`#endif // #if USE_TBB`

			`#if USE_PPL`
			`///`
			`/// PplBodyAdapter -- Converts a body object that implements the`
			`/// "forLoop(int iBegin, int iEnd) const" function`
			`/// into a PPL compatible object that implements "void operator()( int ) const"`
			`///`
			`template <class TBody>`
			`struct PplBodyAdapter`
			`{`
			`const TBody* mBody;`
			`int mGrainSize;`
			`int mIndexEnd;`

			`void operator()( int i ) const`
			`{`
			`mBody->forLoop( i, (std::min)(i + mGrainSize, mIndexEnd) );`
			`}`
			`};`
			`#endif // #if USE_PPL`


			`///`
			`/// parallelFor -- interface for submitting work expressed as a for loop to the worker threads`
			`///`
			`template <class TBody>`
			`void parallelFor( int iBegin, int iEnd, int grainSize, const TBody& body )`
			`{`
			`#if USE_OPENMP`
			`if ( gTaskMgr.getApi() == TaskManager::apiOpenMP )`
			`{`
			`#pragma omp parallel for schedule(static, 1)`
			`for ( int i = iBegin; i < iEnd; i += grainSize )`
			`{`
			`body.forLoop( i, (std::min)( i + grainSize, iEnd ) );`
			`}`
			`return;`
			`}`
			`#endif // #if USE_OPENMP`

			`#if USE_PPL`
			`if ( gTaskMgr.getApi() == TaskManager::apiPpl )`
			`{`
			`// PPL dispatch`
			`PplBodyAdapter<TBody> pplBody;`
			`pplBody.mBody = &body;`
			`pplBody.mGrainSize = grainSize;`
			`pplBody.mIndexEnd = iEnd;`
			`// note: MSVC 2010 doesn't support partitioner args, so avoid them`
			`concurrency::parallel_for( iBegin,`
			`iEnd,`
			`grainSize,`
			`pplBody`
			`);`
			`return;`
			`}`
			`#endif //#if USE_PPL`

			`#if USE_TBB`
			`if ( gTaskMgr.getApi() == TaskManager::apiTbb )`
			`{`
			`// TBB dispatch`
			`TbbBodyAdapter<TBody> tbbBody;`
			`tbbBody.mBody = &body;`
			`tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),`
			`tbbBody,`
			`tbb::simple_partitioner()`
			`);`
			`return;`
			`}`
			`#endif // #if USE_TBB`

			`{`
			`// run on main thread`
			`body.forLoop( iBegin, iEnd );`
			`}`

			`}`