/*
								+---------------------------------+
								|                                 |
								| ***   Parallel primitives   *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2013  |
								|                                 |
								|           Parallel.h            |
								|                                 |
								+---------------------------------+
*/

#pragma once
#ifndef __PARALLEL_PRIMITIVES_INCLUDED
#define __PARALLEL_PRIMITIVES_INCLUDED

/**
 *	@file Parallel.h
 *	@brief simple parallel primitives
 *	@date 2013
 *	@author -tHE SWINe-
 *
 *
 *	@date 2013-05-08
 *
 *	Added CParallelLoop_NoCopy and CThreadParallelLoop_NoCopy to make the parallel
 *	loops easier to use.
 *
 */

#include "Integer.h"
#include <stdexcept>
#if defined(_WIN32) || defined(_WIN64)
#define NOMINMAX
#include <windows.h>
#else // _WIN32 || _WIN64
#include <pthread.h>
#include <semaphore.h>
#include <sched.h>
#endif // _WIN32 || _WIN64

#if defined(_WIN32) || defined(_WIN64)

#include "Parallel.inl"

/**
 *	@brief base class for atomic operations (Windows variant)
 *	@tparam _Ty is data type to perform atomic operations on
 *
 *	This class adds no members of its own, everything is inherited from
 *	__winatom::CAtomicBase_Inner, instantiated with the integer type that
 *	__winatom::CChooseSupportedIntType selects for sizeof(_Ty).
 *
 *	@note NOTE(review): __winatom is presumably declared in Parallel.inl - confirm.
 */
template <class _Ty>
class CAtomicBase : public __winatom::CAtomicBase_Inner<typename
	__winatom::CChooseSupportedIntType<sizeof(_Ty)>::_TyResult> {
};

#else // _WIN32 || _WIN64

#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
#include <stdatomic.h> // old g++ has to rely on stdatomic.h, new has built-in extensions
#endif // __GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))

/**
 *	@brief base class for atomic operations (non-Windows variant)
 *	@tparam _Ty is data type to perform atomic operations on
 *
 *	Compilers that identify themselves as g++ 4.2 or newer use the __sync_*
 *	built-ins, all the other compilers use the atomic_* functions.
 */
template <class _Ty>
class CAtomicBase {
public:
	/**
	 *	@brief atomically exchanges the target with n_value
	 *
	 *	@param[in,out] p_target is pointer to the value to be exchanged
	 *	@param[in] n_value is the new value
	 *
	 *	@return Returns the value of the target before the exchange.
	 *
	 *	@note On some platforms, n_value must be 1 (this is asserted
	 *		when the __sync_* built-ins are used).
	 */
	static inline _Ty n_Atomic_Exchange(_Ty *p_target, _Ty n_value)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		_ASSERTE(n_value == 1);
		// Many targets have only minimal support for such locks, and do not support
		// a full exchange operation. In this case, a target may support reduced
		// functionality here by which the only valid value to store is the immediate
		// constant 1. The exact value actually stored in *ptr is implementation defined.

		return __sync_lock_test_and_set(p_target, n_value); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return atomic_exchange(p_target, n_value); // old G++
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}

	/**
	 *	@brief atomically sets the target to zero
	 *
	 *	This is the counterpart of n_Atomic_Exchange(p_target, 1); the previous
	 *	value of the target is not returned.
	 *
	 *	@param[in,out] p_target is pointer to the value to be zeroed
	 */
	static inline void Atomic_Exchange_0(_Ty *p_target)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		__sync_lock_release(p_target); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		atomic_exchange(p_target, 0); // old G++
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}

	/**
	 *	@brief atomically (pre) increments the target
	 *	@param[in,out] p_target is pointer to the value to be incremented
	 *	@return Returns the value of the target after the increment.
	 */
	static inline _Ty n_Atomic_Increment(_Ty *p_target)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return __sync_add_and_fetch(p_target, 1); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return atomic_fetch_add(p_target, 1) + 1; // old G++ (fetch returns the original value, the new one is one greater)
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}

	/**
	 *	@brief atomically (pre) decrements the target
	 *	@param[in,out] p_target is pointer to the value to be decremented
	 *	@return Returns the value of the target after the decrement.
	 */
	static inline _Ty n_Atomic_Decrement(_Ty *p_target)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return __sync_sub_and_fetch(p_target, 1); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return atomic_fetch_sub(p_target, 1) - 1; // old G++ (fetch returns the original value, the new one is one smaller)
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}

	/**
	 *	@brief atomically adds a value to the target
	 *
	 *	@param[in,out] p_target is pointer to the value to be modified
	 *	@param[in] n_value is the value to add
	 *
	 *	@return Returns the value of the target before the addition.
	 */
	static inline _Ty n_Atomic_FetchAdd(_Ty *p_target, _Ty n_value)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return __sync_fetch_and_add(p_target, n_value); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return atomic_fetch_add(p_target, n_value); // old G++
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}

	/**
	 *	@brief atomically subtracts a value from the target
	 *
	 *	@param[in,out] p_target is pointer to the value to be modified
	 *	@param[in] n_value is the value to subtract
	 *
	 *	@return Returns the value of the target before the subtraction.
	 */
	static inline _Ty n_Atomic_FetchSubtract(_Ty *p_target, _Ty n_value)
	{
#if defined(__GNUC__) && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return __sync_fetch_and_sub(p_target, n_value); // new G++
#else // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
		return atomic_fetch_sub(p_target, n_value); // old G++
#endif // __GNUC__ && !(__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
	}
};

#endif // _WIN32 || _WIN64

/**
 *	@brief a simple atomic counter
 *	@tparam _Ty is data type of the counter
 *
 *	The object holds storage for two counter-sized elements and places the counter
 *	at the first address inside this storage which is a multiple of the element size.
 */
template <class _Ty = int32_t>
class CAtomicCounter {
public:
	typedef _Ty _TyCounter; /**< @brief type of the counter, as seen by the caller */
#if defined(_WIN32) || defined(_WIN64)
	typedef typename __winatom::CChooseSupportedIntType<sizeof(_Ty)>::_TyResult _TyCounterRep; /**< @brief type the counter is stored as */
	typedef CAtomicBase<_TyCounterRep> _TyBase; /**< @brief base implementation of the atomics */
#else // _WIN32 || _WIN64
	typedef _Ty _TyCounterRep; /**< @brief type the counter is stored as */
	typedef CAtomicBase<_Ty> _TyBase; /**< @brief base implementation of the atomics */
#endif // _WIN32 || _WIN64

protected:
	_TyCounterRep m_p_counter_and_alignment[2]; /**< @brief data storage (one element for the counter, one for alignment slack) */
	_TyCounterRep *m_p_counter_addr; /**< @brief pointer to the aligned counter inside the storage */

public:
	/**
	 *	@brief default constructor; aligns the counter
	 *	@note The value of the counter is not initialized.
	 */
	inline CAtomicCounter()
	{
		Align_Counter();
	}

	/**
	 *	@brief constructor; aligns the counter and sets its initial value
	 *	@param[in] n_value is the initial value of the counter
	 *	@note The initialization is not atomic by itself.
	 */
	inline CAtomicCounter(_Ty n_value)
	{
		Align_Counter();
		*m_p_counter_addr = n_value;
	}

	/**
	 *	@brief gets writable reference to the counter value (plain, non-atomic access)
	 *	@return Returns reference to the stored counter.
	 */
	inline _TyCounterRep &n_Value()
	{
		_TyCounterRep &r_counter = *m_p_counter_addr;
		return r_counter;
	}

	/**
	 *	@brief gets the counter value (plain, non-atomic read)
	 *	@return Returns the current value of the counter.
	 */
	inline _TyCounter n_Value() const
	{
		const _TyCounterRep &r_counter = *m_p_counter_addr;
		return _TyCounter(r_counter);
	}

	/**
	 *	@brief atomically exchanges value of the counter with n_value
	 *	@param[in] n_value is the new value of the counter
	 *	@return Returns whatever the underlying CAtomicBase exchange returns.
	 *	@note On some platforms, n_value must be 1.
	 */
	inline _TyCounter n_Atomic_Exchange(_TyCounter n_value)
	{
		const _TyCounterRep n_new_value = _TyCounterRep(n_value);
		return _TyCounter(_TyBase::n_Atomic_Exchange(m_p_counter_addr, n_new_value));
	}

	/**
	 *	@brief forwards to the Atomic_Exchange_0() of the underlying CAtomicBase,
	 *		the counterpart of n_Atomic_Exchange(1)
	 */
	inline void Atomic_Exchange_0()
	{
		_TyCounterRep *p_counter = m_p_counter_addr;
		_TyBase::Atomic_Exchange_0(p_counter);
	}

	/**
	 *	@brief atomically (pre) increments the counter and returns the value
	 */
	inline _TyCounter n_Atomic_Increment()
	{
		_TyCounterRep *p_counter = m_p_counter_addr;
		return _TyCounter(_TyBase::n_Atomic_Increment(p_counter));
	}

	/**
	 *	@brief atomically (pre) decrements the counter and returns the value
	 */
	inline _TyCounter n_Atomic_Decrement()
	{
		_TyCounterRep *p_counter = m_p_counter_addr;
		return _TyCounter(_TyBase::n_Atomic_Decrement(p_counter));
	}

	/**
	 *	@brief atomically adds a value to the counter and returns the original value
	 *	@param[in] n_value is the value to add
	 */
	inline _TyCounter n_Atomic_FetchAdd(_TyCounter n_value)
	{
		const _TyCounterRep n_addend = _TyCounterRep(n_value);
		return _TyCounter(_TyBase::n_Atomic_FetchAdd(m_p_counter_addr, n_addend));
	}

	/**
	 *	@brief atomically subtracts a value from the counter and returns the original value
	 *	@param[in] n_value is the value to subtract
	 */
	inline _TyCounter n_Atomic_FetchSubtract(_TyCounter n_value)
	{
		const _TyCounterRep n_subtrahend = _TyCounterRep(n_value);
		return _TyCounter(_TyBase::n_Atomic_FetchSubtract(m_p_counter_addr, n_subtrahend));
	}

protected:
	/**
	 *	@brief points m_p_counter_addr to the first element-size-aligned
	 *		address inside m_p_counter_and_alignment
	 */
	inline void Align_Counter()
	{
		const size_t n_size = sizeof(_TyCounterRep);
		_ASSERTE(b_Is_POT(n_size)); // should be power of two

		uint8_t *p_aligned = (uint8_t*)&m_p_counter_and_alignment;
		const size_t n_misalignment = size_t(ptrdiff_t(p_aligned)) & (n_size - 1);
		if(n_misalignment)
			p_aligned += n_size - n_misalignment; // skip to the next multiple of n_size
		m_p_counter_addr = (_TyCounterRep*)p_aligned;

		_ASSERTE(ptrdiff_t(m_p_counter_addr) % n_size == 0); // make sure it is aligned
		_ASSERTE(m_p_counter_addr >= &m_p_counter_and_alignment[0]);
		_ASSERTE(m_p_counter_addr <= &m_p_counter_and_alignment[1]); // make sure it is in array bounds
	}

	CAtomicCounter(const CAtomicCounter &UNUSED(r_counter)) {} /**< @brief can't copy counters this way, use pointers instead */
	const CAtomicCounter &operator =(const CAtomicCounter &UNUSED(r_counter)) { return *this; } /**< @brief can't copy counters this way, use pointers instead */
};

/**
 *	@brief a simple implementation of spinlock
 */
class CSpinLock {
protected:
	CAtomicCounter<> m_atomic; /**< @brief atomic counter that serves as the lock */

public:
	/**
	 *	@brief default constructor; initializes the lock as unlocked
	 */
	inline CSpinLock()
	{
		m_atomic.n_Value() = 0;
		// the lock is initially unlocked
	}

	/**
	 *	@brief acquires the lock, blocks if the lock is already acquired by other thread
	 *
	 *	@note This does not put the thread to sleep as CMutex, instead it actively loops.
	 *	@note Always eventually succeeds (may not be in finite time though).
	 */
	inline void Lock()
	{
		while(m_atomic.n_Atomic_Exchange(1))
			;
		// loop until we manage to exchange zero (counter was free)
	}

	/**
	 *	@brief attempts to acquire the lock. in case it is already locked, returns immediately
	 *	@return Returns true in case lock was successfuly acquired, false in case the lock
	 *		was already locked by another thread.
	 */
	inline bool TryLock()
	{
		return !m_atomic.n_Atomic_Exchange(1);
		// if we exchanged zero, the counter was free and we just acquired it
	}

	/**
	 *	@brief releases the lock, returns immediately
	 */
	inline void Unlock()
	{
		m_atomic.Atomic_Exchange_0();
		// exchange with zero to set the counter to free
	}

protected:
	CSpinLock(const CSpinLock &UNUSED(r_lock)) {} /**< @brief can't copy locks this way, use pointers instead */
	const CSpinLock &operator =(const CSpinLock &UNUSED(r_lock)) { return *this; } /**< @brief can't copy locks this way, use pointers instead */
};

#include "Thread.h"

/**
 *	@brief a simple parallel loop model, using native threads
 *	@tparam CFunctor is functor type, invoked as function(n_thread_id, n_thread_num);
 *		the loop keeps its own copy of it (default-constructed, then assigned in Run())
 *
 *	@note This implementation allows for setting arbitrary
 *		number of threads (limited only by memory).
 *	@note The thread calling Run() acts as thread 0, the remaining threads are
 *		workers, started by Run() and kept until Quit() or destruction.
 *	@note Only exceptions derived from std::exception are caught in the workers;
 *		they are rethrown as a plain std::exception (the derived part is sliced off).
 *		If any worker throws, all the workers are stopped and the next Run() starts them anew.
 */
template <class CFunctor>
class CThreadParallelLoop {
protected:
	/**
	 *	@brief a single worker; calls the functor once per iteration of the loop
	 */
	class CCaller : public CRunable {
	protected:
		CSemaphore *m_p_worker_sema; /**< @brief semaphore the worker waits on between the iterations */
		CSemaphore *m_p_master_sema; /**< @brief semaphore signalled by the last worker to finish an iteration */
		bool m_b_quit; /**< @brief quit request flag (checked after each wake-up) */
		bool m_b_caught_exception; /**< @brief set if the functor threw std::exception in this worker */
		std::exception m_exception; /**< @brief copy of the caught exception (the std::exception part only) */
		CThread m_thread; /**< @brief thread that runs this worker */
		size_t m_n_thread_id; /**< @brief id of this thread, passed to the functor */
		size_t m_n_thread_num; /**< @brief number of threads, passed to the functor */
		CFunctor *m_p_context; /**< @brief the functor to be called */
		CAtomicCounter<> *m_p_counter; /**< @brief counter of workers that did not finish the current iteration yet */

	public:
		/**
		 *	@brief default constructor; attaches this object to its thread
		 */
		inline CCaller() throw()
			:m_b_quit(false), m_b_caught_exception(false) // !!
		{
			m_thread.AttachRunable(*this);
		}

		/**
		 *	@brief stores the configuration and starts the worker thread
		 *
		 *	@param[in] n_thread_id is id of this thread
		 *	@param[in] n_thread_num is number of threads
		 *	@param[in] p_context is pointer to the functor (must outlive the thread)
		 *	@param[in] r_worker_sema is semaphore to wait on between the iterations
		 *	@param[in] r_master_sema is semaphore to signal once all the workers finished
		 *	@param[in] r_counter is counter of unfinished workers
		 *
		 *	@return Returns the result of CThread::Start().
		 */
		bool Start(size_t n_thread_id, size_t n_thread_num, CFunctor *p_context,
			CSemaphore &r_worker_sema, CSemaphore &r_master_sema, CAtomicCounter<> &r_counter) throw()
		{
			m_b_quit = false;
			m_b_caught_exception = false;
			m_n_thread_id = n_thread_id;
			m_n_thread_num = n_thread_num;
			m_p_context = p_context;
			m_p_worker_sema = &r_worker_sema;
			m_p_master_sema = &r_master_sema;
			m_p_counter = &r_counter;

			return m_thread.Start();
		}

		/**
		 *	@brief requests the worker to quit the next time it wakes up
		 *	@note This does not wake the worker up, the caller needs to signal the worker
		 *		semaphore after announcing to all the workers (signalling here might wake up
		 *		a different worker, for which the quit flag is not set yet).
		 */
		inline void Announce() throw()
		{
			m_b_quit = true;
		}

		/**
		 *	@brief stops the thread of this worker
		 *	@return Returns false if Announce() was not called first,
		 *		otherwise returns the result of CThread::Stop().
		 */
		bool WaitForFinish() throw()
		{
			if(!m_b_quit)
				return false; // need to Announce() first
			return m_thread.Stop(); // note that it might raise the exception
		}

		/**
		 *	@brief determines whether the functor threw in this worker
		 *	@return Returns true if an exception was caught and not collected yet.
		 */
		inline bool b_Threw() const throw()
		{
			return m_b_caught_exception;
		}

		/**
		 *	@brief collects the caught exception and clears the exception flag
		 *	@return Returns copy of the caught exception.
		 */
		inline std::exception GetException() throw()
		{
			_ASSERTE(m_b_caught_exception); // don't get exception if it did not throw
			m_b_caught_exception = false; // exception handled, ready to go on
			return m_exception;
		}

	protected:
		/**
		 *	@brief the thread function; calls the functor, reports to the main
		 *		thread and waits for the next iteration, until requested to quit
		 */
		inline virtual void Run() throw()
		{
			if(m_n_thread_num % CThread::n_CPU_Num() == 0) {
				CCurrentThreadHandle h;
				//h.Set_HighPriority();
				h.Set_AffinityMask32(uint32_t(1) << (m_n_thread_id % 32)); // fix thread affinities (only if using all cores)
				// unsigned shift by less than 32 bits; shifting int 1 by 31 or more is not defined
			}

			try {
				while(!m_b_quit) {
					(*m_p_context)(m_n_thread_id, m_n_thread_num);
					// do the work

					if(!m_p_counter->n_Atomic_Decrement())
						m_p_master_sema->Signal();
					// let the main thread through once all finished

					m_p_worker_sema->Wait();
					// wait for the main thread to signal the next iteration
				}
			} catch(std::exception &r_exc) {
				m_exception = r_exc;
				m_b_caught_exception = true;
				// take care of exception transport; this must be done *before* releasing
				// the main thread, otherwise it could look for the exception too early

				if(!m_p_counter->n_Atomic_Decrement())
					m_p_master_sema->Signal(); // let the main thread through
				// avoid the main thread getting stuck on failure
			}
		}
	};

	CAtomicCounter<> m_counter; /**< @brief counter of workers that did not finish the current iteration yet */
	size_t m_n_thread_num; /**< @brief number of threads, including the calling one (never zero) */
	bool m_b_first_time; /**< @brief set if the workers are not running and need to be started */
	CFunctor m_context; /**< @brief copy of the functor, shared by all the threads */
	CSemaphore m_main_sema, m_worker_sema; /**< @brief the barrier semaphores */
	CCaller *m_p_caller; /**< @brief array of m_n_thread_num - 1 workers */

public:
	/**
	 *	@brief default constructor; allocates the workers (does not start them)
	 *	@param[in] n_thread_num is number of threads, including the calling
	 *		thread (zero is treated as one)
	 *	@note The allocation of the workers may throw (the original
	 *		annotation says std::bad_alloc).
	 */
	inline CThreadParallelLoop(size_t n_thread_num = CThread::n_CPU_Num()) // throw(std::bad_alloc)
		:m_n_thread_num((n_thread_num)? n_thread_num : 1), m_b_first_time(true),
		m_main_sema(0), m_worker_sema(0)
	{
		m_p_caller = new CCaller[m_n_thread_num - 1];
		// alloc threads (the calling thread is one of them, m_n_thread_num - 1 can't underflow)
	}

	/**
	 *	@brief destructor; stops the workers and deletes them
	 */
	inline ~CThreadParallelLoop()
	{
		try {
			Quit();
		} catch(std::exception&) {
			// exceptions must not leave the destructor
		}
		// can do, threads succeed in stopping if they are not running

		delete[] m_p_caller;
		// delete workers
	}

	/**
	 *	@brief runs the functor once in every thread and waits for all of them to finish
	 *
	 *	@param[in] function is the functor to run (it is copied)
	 *
	 *	@return Returns true on success, false if the workers could not be started.
	 *
	 *	@note This rethrows std::exception if the functor threw it in a worker. If the functor
	 *		throws in the calling thread, the exception passes through once the workers finished.
	 */
	bool Run(CFunctor function) // throw(std::exception)
	{
		m_context = function; // the address of the context needs to exist even after returning from this function
		const size_t n_worker_num = m_n_thread_num - 1;
		_ASSERTE(n_worker_num <= CMaxIntValue<CAtomicCounter<>::_TyCounter>::result());
		m_counter.n_Value() = CAtomicCounter<>::_TyCounter(n_worker_num); // workers decrement this, the last one signals the barrier
		if(m_b_first_time) {
			if(n_worker_num && !m_p_caller)
				return false;
			for(size_t i = 1; i < m_n_thread_num; ++ i) {
				if(!m_p_caller[i - 1].Start(i, m_n_thread_num, &m_context,
				   m_worker_sema, m_main_sema, m_counter)) {
					if(i > 1) {
						StopWorkers(i - 1, i - 1);
						// stop already running workers (note that this might deadlock
						// if the workers have some sync inside the user code)
						// NOTE(review): a worker that threw does not consume its signal here

						RethrowWorkerExceptions(i);
						// handle exceptions
					}
					// handle running workers

					return false;
				}
			}
			// start the workers

			m_b_first_time = false; // !!
		} else if(n_worker_num) {
			_ASSERTE(n_worker_num <= INT_MAX);
			m_worker_sema.Signal(int(n_worker_num));
			// let the workers go through again
		}

		const bool b_fix_affinity = m_n_thread_num % CThread::n_CPU_Num() == 0;
		if(b_fix_affinity) {
			CCurrentThreadHandle h;
			//h.Set_HighPriority();
			h.Set_AffinityMask32(1);
			// test - set thread 0 mask as well
		}

		try {
			m_context(size_t(0), m_n_thread_num);
			// run thread 0 as well
		} catch(...) {
			if(b_fix_affinity) {
				CCurrentThreadHandle h;
				h.Set_AffinityMask32(UINT32_MAX);
			}
			WaitForWorkers();
			// the workers still use m_context, can't leave before they finished

			throw;
		}

		if(b_fix_affinity) {
			CCurrentThreadHandle h;
			//h.Set_NormalPriority();
			h.Set_AffinityMask32(UINT32_MAX);
		}

		WaitForWorkers();
		// wait for the last worker to signal ready before going through

		RethrowWorkerExceptions(m_n_thread_num);
		// handle exception transport

		return true;
	}

	/**
	 *	@brief stops the workers; the next call to Run() will start them anew
	 *	@return Returns true on success (also if the workers are not running),
	 *		false if any of the threads failed to stop.
	 *	@note This rethrows std::exception if a worker has one that was not collected yet.
	 */
	bool Quit() // throw(std::exception)
	{
		if(m_n_thread_num % CThread::n_CPU_Num() == 0) {
			CCurrentThreadHandle h;
			//h.Set_NormalPriority();
			h.Set_AffinityMask32(UINT32_MAX);
			// put it back // todo - make this an optional function
		}

		if(m_n_thread_num > 1 && !m_p_caller)
			return false;
		if(m_b_first_time)
			return true;
		// the workers are not running; signalling the semaphore now would leave it
		// raised and the workers started later would not wait at the barrier

		const bool b_result = StopWorkers(m_n_thread_num - 1, m_n_thread_num - 1);
		// signal all the workers to resume (and quit), join their threads

		m_b_first_time = true;
		// will start the threads anew next time (set before possibly throwing below)

		RethrowWorkerExceptions(m_n_thread_num);
		// take care of exception transport

		return b_result;
	}

protected:
	/**
	 *	@brief requests the first n_worker_num workers to quit, wakes them up and joins them
	 *
	 *	@param[in] n_worker_num is number of workers to stop
	 *	@param[in] n_signal_num is number of workers that wait (or will wait)
	 *		on the worker semaphore and need to be woken up
	 *
	 *	@return Returns true if all the threads stopped successfully, otherwise returns false.
	 */
	bool StopWorkers(size_t n_worker_num, size_t n_signal_num) throw()
	{
		for(size_t i = 0; i < n_worker_num; ++ i)
			m_p_caller[i].Announce();
		if(n_signal_num) {
			_ASSERTE(n_signal_num <= INT_MAX);
			m_worker_sema.Signal(int(n_signal_num)); // *after* the loop
		}
		// signal the workers to resume (and quit)

		bool b_result = true;
		for(size_t i = 0; i < n_worker_num; ++ i) {
			if(!m_p_caller[i].WaitForFinish())
				b_result = false; // must *always* go
		}
		// join all workers' threads

		return b_result;
	}

	/**
	 *	@brief waits for all the workers to finish the current iteration; if any
	 *		of them threw, stops all the workers so that the next Run() restarts them
	 */
	void WaitForWorkers() throw()
	{
		if(m_n_thread_num < 2)
			return;
		// there are no workers, nobody would ever signal the barrier

		m_main_sema.Wait();
		// wait for the last worker to signal ready

		size_t n_failed_num = 0;
		for(size_t i = 1; i < m_n_thread_num; ++ i) {
			if(m_p_caller[i - 1].b_Threw())
				++ n_failed_num;
		}
		if(n_failed_num) {
			StopWorkers(m_n_thread_num - 1, m_n_thread_num - 1 - n_failed_num);
			m_b_first_time = true;
		}
		// workers that threw left their loop and would never reach the barrier
		// again; stop the remaining ones (only those wait on the semaphore)
	}

	/**
	 *	@brief collects exceptions from the workers and rethrows the last one found
	 *	@param[in] n_thread_num is number of threads to check, including thread 0
	 *		(workers with ids 1 to n_thread_num - 1 are checked)
	 */
	void RethrowWorkerExceptions(size_t n_thread_num) // throw(std::exception)
	{
		_ASSERTE(m_p_caller);
		bool b_caught = false;
		std::exception t_exception;
		for(size_t i = 1; i < n_thread_num; ++ i) {
			if(m_p_caller[i - 1].b_Threw()) {
				b_caught = true;
				t_exception = m_p_caller[i - 1].GetException();
			}
		}
		if(b_caught)
			throw t_exception; // rethrow
		// handle exceptions
	}

	CThreadParallelLoop(const CThreadParallelLoop &UNUSED(r_loop)) {} /**< @brief can't copy loops this way, use pointers instead */
	const CThreadParallelLoop &operator =(const CThreadParallelLoop &UNUSED(r_loop)) { return *this; } /**< @brief can't copy loops this way, use pointers instead */
};

#ifdef _OPENMP

#include <omp.h>

/**
 *	@brief a simple parallel loop model, using OpemnMP
 */
template <class CFunctor>
class CParallelLoop {
protected:
	size_t m_n_thread_num;

public:
	inline CParallelLoop(size_t n_thread_num)
		:m_n_thread_num(n_thread_num)
	{
		_ASSERTE(m_n_thread_num <= INT_MAX); // alert caller early
	}

	inline bool Run(CFunctor function) // throw(std::exception)
	{
		_ASSERTE(m_n_thread_num <= INT_MAX);
		omp_set_num_threads(int(m_n_thread_num));
		// set number of threads

		#pragma omp parallel
		{
			size_t n_thread = omp_get_thread_num();
			function(n_thread, omp_get_num_threads());
		}

		return true;
	}

	inline bool Quit()
	{
		return true;
	}
};

#else // _OPENMP

/**
 *	@brief a simple parallel loop model, using native threads
 *	@tparam CFunctor is a parallel functor
 *
 *	Used when OpenMP is not enabled; everything is inherited from CThreadParallelLoop.
 */
template <class CFunctor>
class CParallelLoop : public CThreadParallelLoop<CFunctor> {
protected:
	typedef CThreadParallelLoop<CFunctor> _TyThreadLoop; /**< @brief the base class */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] n_thread_num is number of threads (passed on to the base class)
	 */
	inline CParallelLoop(size_t n_thread_num = CThread::n_CPU_Num())
		:_TyThreadLoop(n_thread_num)
	{}
};

#endif // _OPENMP

/**
 *	@brief a simple wrapper for parallel functors that are not copy-able
 *	@tparam CFunctor is a parallel functor
 *
 *	The wrapper only stores a pointer to the functor, copies of the wrapper
 *	all call the same functor object.
 */
template <class CFunctor>
class CNoCopyParallelFunctorWrapper {
protected:
	CFunctor *m_p_fun; /**< @brief pointer to the wrapped functor (null if default-constructed) */

public:
	/**
	 *	@brief default constructor; wraps no functor
	 *	@note The wrapper must not be called before a wrapper with a functor is assigned to it.
	 */
	inline CNoCopyParallelFunctorWrapper()
		:m_p_fun(0)
	{}

	/**
	 *	@brief constructor; wraps the given functor
	 *	@param[in] r_functor is the functor to be called (only its address is stored)
	 */
	inline CNoCopyParallelFunctorWrapper(CFunctor &r_functor)
		:m_p_fun(&r_functor)
	{}

	/**
	 *	@brief calls the wrapped functor
	 *
	 *	@param[in] n_thread_id is passed on to the functor as its first argument
	 *	@param[in] n_thread_num is passed on to the functor as its second argument
	 */
	inline void operator ()(size_t n_thread_id, size_t n_thread_num)
	{
		CFunctor &r_wrapped = *m_p_fun;
		r_wrapped(n_thread_id, n_thread_num);
	}
};

/**
 *	@brief a simple parallel loop model, for objects that are not copy-able
 *	@tparam CFunctor is a parallel functor (not copy-able object)
 *
 *	The functor is passed to CParallelLoop wrapped in CNoCopyParallelFunctorWrapper.
 */
template <class CFunctor>
class CParallelLoop_NoCopy : public CParallelLoop<CNoCopyParallelFunctorWrapper<CFunctor> > {
protected:
	typedef CNoCopyParallelFunctorWrapper<CFunctor> _TyWrapper; /**< @brief the functor wrapper */
	typedef CParallelLoop<_TyWrapper> _TyLoop; /**< @brief the base class */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] n_thread_num is number of threads (passed on to the base class)
	 */
	inline CParallelLoop_NoCopy(size_t n_thread_num = CThread::n_CPU_Num())
		:_TyLoop(n_thread_num)
	{}

	/**
	 *	@brief runs the functor in the parallel loop, without copying it
	 *	@param[in] r_function is the functor to run (called by reference)
	 *	@return Returns the result of CParallelLoop::Run().
	 */
	inline bool Run(CFunctor &r_function) // throw(std::exception)
	{
		_TyWrapper wrapper(r_function);
		return _TyLoop::Run(wrapper);
	}
};

/**
 *	@brief a simple threaded parallel loop model, for objects that are not copy-able
 *	@tparam CFunctor is a parallel functor (not copy-able object)
 *
 *	The functor is passed to CThreadParallelLoop wrapped in CNoCopyParallelFunctorWrapper.
 */
template <class CFunctor>
class CThreadParallelLoop_NoCopy : public CThreadParallelLoop<CNoCopyParallelFunctorWrapper<CFunctor> > {
protected:
	typedef CNoCopyParallelFunctorWrapper<CFunctor> _TyWrapper; /**< @brief the functor wrapper */
	typedef CThreadParallelLoop<_TyWrapper> _TyLoop; /**< @brief the base class */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] n_thread_num is number of threads (passed on to the base class)
	 */
	inline CThreadParallelLoop_NoCopy(size_t n_thread_num = CThread::n_CPU_Num())
		:_TyLoop(n_thread_num)
	{}

	/**
	 *	@brief runs the functor in the parallel loop, without copying it
	 *	@param[in] r_function is the functor to run (called by reference)
	 *	@return Returns the result of CThreadParallelLoop::Run().
	 */
	inline bool Run(CFunctor &r_function) // throw(std::exception)
	{
		_TyWrapper wrapper(r_function);
		return _TyLoop::Run(wrapper);
	}
};

#endif // !__PARALLEL_PRIMITIVES_INCLUDED
