/*
								+--------------------------------+
								|                                |
								|   ***  OpenCL utilities  ***   |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|           ClUtils.h            |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __CUDA_UTILS_INCLUDED
#define __CUDA_UTILS_INCLUDED

/**
 *	@file gpgpu/ClUtils.h
 *	@author -tHE SWINe-
 *	@brief OpenCL utilities
 *	@date 2010
 *
 *	@date 2010-08-10
 *
 *	changed OpenCL program binaries size in CCLUtils from uint64_t to size_t
 *	as Visual Studio 2008 has some problems with new[uint64_t].
 *
 *	@date 2010-08-13
 *
 *	added CCLProgramCompiler::Get_BuildLog() to enable build error reporting
 *
 *	added new CCLDeviceParams::CCLDeviceParams() taking just cl_device_id device handle
 *
 *	added static version of CCLDeviceParams::b_IsExtensionSupported(), enabling caller
 *	to determine whether a particular device supports a given extension without the need to
 *	instantiate the whole CCLDeviceParams.
 *
 *	added static version of CCLDeviceParams::n_GetDeviceInfoString(), enabling caller
 *	to get device info string (eg. the device name) without the need to instantiate
 *	the whole CCLDeviceParams.
 *
 *	added static CCLDeviceParams::n_Query_DeviceProperties(), enabling caller to get
 *	device properties structure without the need to instantiate the whole CCLDeviceParams.
 *
 *	@date 2010-09-28
 *
 *	renamed clParamsSet*() to clSetKernelArgs*() (clParamSet() is CUDA function name
 *	while clSetKernelArg() is OpenCL function name).
 *
 *	added CCLLocalMem so it is possible to specify local memory buffers for kernels.
 *	consider the following code example use:
 *
 *	@code
 *	cl_kernel h_kernel;
 *	cl_mem p_dest, p_src;
 *	size_t n_local_block_size = 128;
 *	clSetKernelArgs(h_kernel, p_dest, p_src, CCLLocalMem(n_local_block_size * sizeof(float)));
 *	// each thread block will have 128 floats (512 bytes) of local memory@endcode
 *
 *	@date 2010-11-01
 *
 *	Added a new storage format for program binaries through CCLProgramStorage. It enables saving
 *	several versions of the same program with different compiler configuration (different defines).
 *	It avoids blowing file size by employing LRU-type scheme to limit number of versions saved.
 *
 *	@date 2010-12-05
 *
 *	Added device driver to device name in CCLProgramStorage. Loading older binaries using newer
 *	driver causes problems (OpenCL return value -36 when launching the kernel) as the driver
 *	doesn't seem to recognize them as old binaries.
 *
 *	@date 2011-10-21
 *
 *	Fixed one position of \#endif // CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
 *	which triggered build errors in cases if CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
 *	macro was not defined.
 *
 *	@date 2012-03-17
 *
 *	Added fopen_s() variants of file i/o in CCLProgramCompiler and CCLProgramStorage to avoid
 *	CRT deprecation warnings in the newer versions of Visual Studio.
 *
 *	@date 2012-06-19
 *
 *	Moved multiple inclusion guard before file documentation comment.
 *
 *	@date 2013-08-30
 *
 *	Added code to get SIMD width on non-NVIDIA platforms. Seems to work well with Intel Xeon / MIC.
 *
 *	@date 2013-11-13
 *
 *	Added support for more than 4 GB of device memory to CCLDeviceParams.
 *
 *	@date 2013-09-11
 *
 *	Added support for compressed programs in CCLUniqueProgram (via CCLUniqueProgram::from_compressed
 *	tag), refactored CCLUniqueProgram members to start with the "m_" prefix.
 *
 */

#ifdef __OPENCL_CL_H
#pragma message("warning: CL/cl.h does not need to be included in files using ClUtils.h")
// this avoids errors of stdint.h not being included
#endif // __OPENCL_CL_H

#include <vector>
#include <algorithm>
#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER > 1200
#include <stdint.h>
#else // !_MSC_VER || __MWERKS__ || _MSC_VER > 1200
#include <stddef.h>
typedef ptrdiff_t intptr_t;
#endif // !_MSC_VER || __MWERKS__ || _MSC_VER > 1200
#include <CL/opencl.h>
#include "../Hash.h"
#include "../StlUtils.h"
#include "../StdIOUtils.h"

/**
 *	@def CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS
 *	@brief if defined, CCLProgramCompiler::n_CompileProgram() and CCLProgramCompiler::n_CompileProgramFile() show build log after unsuccessful build
 */
#define CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS

/**
 *	@def CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
 *	@brief if defined, CCLProgramCompiler::n_CompileProgram() and CCLProgramCompiler::n_CompileProgramFile() show build log even after successful build (handy for debugging)
 */
//#define CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS

/**
 *	@def CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
 *	@brief if defined, the old simple binary format is used to store compiled program binaries, instead of CCLProgramStorage
 */
//#define CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT

/**
 *	@def CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
 *	@brief if defined, program binaries are first written to a temporary file and then copied over the old binaries; this gives the possibility of falling back to the older binaries in case writing failed ... but it is overkill for most applications
 *	@note This is only in effect if CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT is not defined.
 */
//#define CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE

/**
 *	@brief OpenCL error codes enums for easier debugging
 *
 *	Each enumerator carries the numeric value of the corresponding OpenCL
 *	error code (shown in the trailing comment). The first group is defined
 *	using the CL_* macros from the OpenCL headers, the remaining groups are
 *	written as integer literals.
 *
 *	@note The literal groups are presumably spelled numerically so that this
 *		header also compiles against OpenCL headers which do not define the
 *		corresponding macros yet - TODO confirm.
 */
enum ECLErrorCode {
	cl_Success = CL_SUCCESS, // 0
	cl_Device_Not_Found = CL_DEVICE_NOT_FOUND, // -1
	cl_Device_Not_Available = CL_DEVICE_NOT_AVAILABLE, // -2
	cl_Compiler_Not_Available = CL_COMPILER_NOT_AVAILABLE, // -3
	cl_Mem_Object_Allocation_Failure = CL_MEM_OBJECT_ALLOCATION_FAILURE, // -4
	cl_Out_Of_Resources = CL_OUT_OF_RESOURCES, // -5
	cl_Out_Of_Host_Memory = CL_OUT_OF_HOST_MEMORY, // -6
	cl_Profiling_Info_Not_Available = CL_PROFILING_INFO_NOT_AVAILABLE, // -7
	cl_Mem_Copy_Overlap = CL_MEM_COPY_OVERLAP, // -8
	cl_Image_Format_Mismatch = CL_IMAGE_FORMAT_MISMATCH, // -9
	cl_Image_Format_Not_Supported = CL_IMAGE_FORMAT_NOT_SUPPORTED, // -10
	cl_Build_Program_Failure = CL_BUILD_PROGRAM_FAILURE, // -11
	cl_Map_Failure = CL_MAP_FAILURE, // -12
	cl_Misaligned_Sub_Buffer_Offset = CL_MISALIGNED_SUB_BUFFER_OFFSET, // -13
	cl_Exec_Status_Error_For_Events_In_Wait_List = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, // -14
	cl_Invalid_Value = CL_INVALID_VALUE, // -30
	cl_Invalid_Device_Type = CL_INVALID_DEVICE_TYPE, // -31
	cl_Invalid_Platform = CL_INVALID_PLATFORM, // -32
	cl_Invalid_Device = CL_INVALID_DEVICE, // -33
	cl_Invalid_Context = CL_INVALID_CONTEXT, // -34
	cl_Invalid_Queue_Properties = CL_INVALID_QUEUE_PROPERTIES, // -35
	cl_Invalid_Command_Queue = CL_INVALID_COMMAND_QUEUE, // -36
	cl_Invalid_Host_Ptr = CL_INVALID_HOST_PTR, // -37
	cl_Invalid_Mem_OBJECT = CL_INVALID_MEM_OBJECT, // -38
	cl_Invalid_Image_Format_Descriptor = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, // -39
	cl_Invalid_Image_Size = CL_INVALID_IMAGE_SIZE, // -40
	cl_Invalid_Sampler = CL_INVALID_SAMPLER, // -41
	cl_Invalid_Binary = CL_INVALID_BINARY, // -42
	cl_Invalid_Build_Options = CL_INVALID_BUILD_OPTIONS, // -43
	cl_Invalid_Program = CL_INVALID_PROGRAM, // -44
	cl_Invalid_Program_Executable = CL_INVALID_PROGRAM_EXECUTABLE, // -45
	cl_Invalid_Kernel_Name = CL_INVALID_KERNEL_NAME, // -46
	cl_Invalid_Kernel_Definition = CL_INVALID_KERNEL_DEFINITION, // -47
	cl_Invalid_Kernel = CL_INVALID_KERNEL, // -48
	cl_Invalid_Arg_Index = CL_INVALID_ARG_INDEX, // -49
	cl_Invalid_Arg_Value = CL_INVALID_ARG_VALUE, // -50
	cl_Invalid_Arg_Size = CL_INVALID_ARG_SIZE, // -51
	cl_Invalid_Kernel_Args = CL_INVALID_KERNEL_ARGS, // -52
	cl_Invalid_Work_Dimension = CL_INVALID_WORK_DIMENSION, // -53
	cl_Invalid_Work_Group_Size = CL_INVALID_WORK_GROUP_SIZE, // -54
	cl_Invalid_Work_Item_Size = CL_INVALID_WORK_ITEM_SIZE, // -55
	cl_Invalid_Global_Offset = CL_INVALID_GLOBAL_OFFSET, // -56
	cl_Invalid_Event_Wait_List = CL_INVALID_EVENT_WAIT_LIST, // -57
	cl_Invalid_Event = CL_INVALID_EVENT, // -58
	cl_Invalid_Operation = CL_INVALID_OPERATION, // -59
	cl_Invalid_GL_Object = CL_INVALID_GL_OBJECT, // -60
	cl_Invalid_Buffer_Size = CL_INVALID_BUFFER_SIZE, // -61
	cl_Invalid_Mip_Level = CL_INVALID_MIP_LEVEL, // -62
	cl_Invalid_Global_Work_Size = CL_INVALID_GLOBAL_WORK_SIZE, // -63
	cl_Invalid_Property = CL_INVALID_PROPERTY, // -64

	// error codes added in OpenCL 1.2 and OpenCL 2.0 (given as literals, not as CL_* macros)
	cl_Compile_Program_Failure = -15, // CL_COMPILE_PROGRAM_FAILURE (OpenCL 1.2)
	cl_Linker_Not_Available = -16, // CL_LINKER_NOT_AVAILABLE (OpenCL 1.2)
	cl_Link_Program_Failure = -17, // CL_LINK_PROGRAM_FAILURE (OpenCL 1.2)
	cl_Device_Partition_Failed = -18, // CL_DEVICE_PARTITION_FAILED (OpenCL 1.2)
	cl_Kernel_Arg_Info_Not_Available = -19, // CL_KERNEL_ARG_INFO_NOT_AVAILABLE (OpenCL 1.2)
	cl_Invalid_Image_Descriptor = -65, // CL_INVALID_IMAGE_DESCRIPTOR (OpenCL 1.2)
	cl_Invalid_Compiler_Options = -66, // CL_INVALID_COMPILER_OPTIONS (OpenCL 1.2)
	cl_Invalid_Linker_Options = -67, // CL_INVALID_LINKER_OPTIONS (OpenCL 1.2)
	cl_Invalid_Device_Partition_Count = -68, // CL_INVALID_DEVICE_PARTITION_COUNT (OpenCL 1.2)
	cl_Invalid_Pipe_Size = -69, // CL_INVALID_PIPE_SIZE (OpenCL 2.0)
	cl_Invalid_Device_Queue = -70, // CL_INVALID_DEVICE_QUEUE (OpenCL 2.0)

	// additional error code of the cl_khr_icd extension
	cl_Platform_Not_Found_KHR = -1001, // CL_PLATFORM_NOT_FOUND_KHR

	// error codes of the cl_ext_device_fission extension
	cl_Device_Partition_Failed_EXT = -1057, // CL_DEVICE_PARTITION_FAILED_EXT
	cl_Invalid_Partition_Count_EXT = -1058, // CL_INVALID_PARTITION_COUNT_EXT
	cl_Invalid_Partition_Name_EXT = -1059 // CL_INVALID_PARTITION_NAME_EXT
};

typedef ECLErrorCode CLresult;
// result type returned by the utilities declared in this file; being an enum,
// the value is displayed by name in a debugger (plain cl_int could be used instead)

/**
 *	@brief class with some simple OpenCL utility functions
 */
class CCLUtils {
public:
	/**
	 *	@brief device GFLOPS scoring for device selection
	 */
	struct TDevice_GFLOPS {
		typedef double TObjective; /**< @brief type of scoring objective */

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		double operator ()(cl_device_id h_device) const;
	};

	/**
	 *	@brief device memory size scoring for device selection
	 */
	struct TDevice_GlobalMemSize {
		typedef cl_ulong TObjective; /**< @brief type of scoring objective */

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		cl_ulong operator ()(cl_device_id h_device) const;
	};

	/**
	 *	@brief lazily evaluated device GFLOPS scoring for device selection
	 */
	struct TDevice_GFLOPS_Lazy {
		/**
		 *	@brief type of scoring objective
		 */
		struct TObjective {
			mutable double f_gflops; /**< @brief scoring objective */
			cl_device_id h_device; /**< @brief handle to the device */

			/**
			 *	@brief default constructor
			 *	@param[in] _h_device is device handle
			 */
			inline TObjective(cl_device_id _h_device);

			/**
			 *	@brief lazily evaluates the score
			 *	@return Returns score value for the device specified in the constructor.
			 */
			inline double f_LazyEval() const;

			/**
			 *	@brief score equality operator
			 *	@param[in] r_right is score on the right side of the comparison
			 *	@return Returns true if the scores are equal, otherwise returns false.
			 */
			inline bool operator ==(const TObjective &r_right) const;

			/**
			 *	@brief score less-than operator
			 *	@param[in] r_right is score on the right side of the comparison
			 *	@return Returns true if this score is less than the score
			 *		on the right side of the comparison, otherwise returns false.
			 */
			inline bool operator <(const TObjective &r_right) const;
		};

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		inline TObjective operator ()(cl_device_id h_device) const;
	};

	/**
	 *	@brief primary objective device scoring with fallback to a secondary objective
	 *
	 *	@tparam CPrimaryObjective is primary objective
	 *	@tparam CSecondaryObjective is secondary objective
	 */
	template <class CPrimaryObjective, class CSecondaryObjective>
	struct TDeviceSelect_ChainObjectives {
	protected:
		CPrimaryObjective primary_objective; /**< @brief primary objective configuration */
		CSecondaryObjective secondary_objective; /**< @brief secondary objective configuration */

	public:
		typedef std::pair<typename CPrimaryObjective::TObjective,
			typename CSecondaryObjective::TObjective> TObjective; /**< @brief type of scoring objective */

		/**
		 *	@brief default constructor; specifies the primary objective settings
		 *	@param[in] objective is primary objective instance
		 */
		inline TDeviceSelect_ChainObjectives(CPrimaryObjective objective = CPrimaryObjective(),
			CSecondaryObjective objective2 = CSecondaryObjective());

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		inline TObjective operator ()(cl_device_id h_device) const;
	};

	/**
	 *	@brief device compute capability scoring for device selection
	 */
	struct TDevice_ComputeCapability {
		typedef int TObjective; /**< @brief type of scoring objective */

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		int operator ()(cl_device_id h_device) const;
	};

	/**
	 *	@brief device OpenCL C version scoring for device selection
	 */
	struct TDevice_OpenCL_C_Version {
		typedef int TObjective; /**< @brief type of scoring objective */

		/**
		 *	@brief device score evaluation
		 *	@param[in] h_device is device to be evaluated
		 *	@return Returns score for the given device.
		 */
		int operator ()(cl_device_id h_device) const;
	};

	typedef TDeviceSelect_ChainObjectives<TDevice_GFLOPS, TDevice_GlobalMemSize>
		TDevice_DefaultScoring; /** <@brief device GFLOPS then global memory size for device selection */

public:
	/**
	 *	@brief initializes OpenCL
	 *
	 *	@param[out] p_context is pointer where a new OpenCL context handle is written upon successful return
	 *	@param[in] n_device_type is required device type (eg. CL_DEVICE_TYPE_GPU)
	 *	@param[in] b_implementation_profile_selection chooses between
	 *		"OpenCL implementation-specific" profile selection, or profile selection
	 *		based on implemented features
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@deprecated This function is deprecated, as it may cause reduced performance or problems
	 *		on platforms with multiple devices (as all the platform devices are part of the context).
	 */
	static CLresult n_OpenCL_Init(cl_context *p_context, int n_device_type = CL_DEVICE_TYPE_GPU,
		bool b_implementation_profile_selection = false);

	/**
	 *	@brief gets list of available OpenCL platforms
	 *	@param[out] r_platform_list is reference to a vector to hold the platform ids
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_Get_PlatformList(std::vector<cl_platform_id> &r_platform_list);

	/**
	 *	@brief finds an OpenCL platform with full profile
	 *
	 *	@param[out] p_platform is filled with OpenCL platform id upon successful return
	 *	@param[in] n_device_type is device type hint (if there are more platofrms,
	 *		chooses one with devices of the given type; default CL_DEVICE_TYPE_GPU)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_Get_FullProfile_Platform(cl_platform_id *p_platform,
		cl_device_type n_device_type = CL_DEVICE_TYPE_GPU);

	/**
	 *	@brief initializes OpenCL context, based on a list of devices
	 *
	 *	@param[out] p_context is filled with handle to a new OpenCL context upon successful return
	 *	@param[in] h_platform is OpenCL platform id (or null handle for implementation-dependent selection)
	 *	@param[in] n_device_num is number of devices to be associated with the context
	 *	@param[in] p_device is list of devices to be associated with the context
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_OpenCL_Init(cl_context *p_context, cl_platform_id h_platform,
		size_t n_device_num, const cl_device_id *p_device);

	/**
	 *	@brief gets list of devices available in specified context
	 *
	 *	@param[in] h_context is handle to OpenCL context
	 *	@param[out] r_device_list is list to be filled with device id's
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_GetDeviceList(cl_context h_context, std::vector<cl_device_id> &r_device_list);

	/**
	 *	@brief gets list of devices available in specified platform
	 *
	 *	@param[in] h_platform is handle to OpenCL platform
	 *	@param[out] r_device_list is list to be filled with device id's
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_GetDeviceList(cl_platform_id h_platform, std::vector<cl_device_id> &r_device_list,
		cl_device_type n_device_type = CL_DEVICE_TYPE_GPU);

	/**
	 *	@brief gets id of device with best score
	 *
	 *	@tparam CScoringObjective is scoring objective (e.g. TDevice_GFLOPS)
	 *
	 *	@param[out] p_device_id is pointer where id of the most
	 *		powerful device handle is written upon successful return
	 *	@param[in] h_context is handle to OpenCL context
	 *
	 *	@return Returns index of the selected device, or -1 on error.
	 *
	 *	@note This is still useful for e.g. getting the more powerful device
	 *		in a multi-device context in some master-slave type computation.
	 */
	template <class CScoringObjective>
	static size_t n_Get_Best_DeviceId(cl_device_id *p_device_id,
		cl_context h_context, CScoringObjective objective) // unfortunately msvc 6.0 requires the function definition here, otherwise it won't compile
	{
		std::vector<cl_device_id> device_list;
		if(n_GetDeviceList(h_context, device_list) != CL_SUCCESS || device_list.empty())
			return size_t(-1);
		// get all the devices

		size_t n_best_device = 0;
		if(device_list.size() > 1) {
			typename CScoringObjective::TObjective t_best_objective = objective(device_list.front());
			for(size_t i = 1, n = device_list.size(); i < n; ++ i) {
				typename CScoringObjective::TObjective t_objective = objective(device_list[i]);
				if(t_best_objective < t_objective) {
					t_best_objective = t_objective;
					n_best_device = i;
				}
			}
		}
		// go trough devices, and score them

		*p_device_id = device_list[n_best_device];
		// write handle to the best device

		return n_best_device;
	}

	/**
	 *	@brief gets id of device with best score
	 *
	 *	@tparam CScoringObjective is scoring objective (e.g. TDevice_GFLOPS)
	 *
	 *	@param[out] p_device_id is pointer where id of the most
	 *		powerful device handle is written upon successful return
	 *	@param[in] h_platform is handle to OpenCL platform
	 *
	 *	@return Returns index of the selected device, or -1 on error.
	 */
	template <class CScoringObjective>
	static size_t n_Get_Best_DeviceId(cl_device_id *p_device_id,
		cl_platform_id h_platform, cl_device_type n_device_type,
		CScoringObjective objective) // unfortunately msvc 6.0 requires the function definition here, otherwise it won't compile
	{
		std::vector<cl_device_id> device_list;
		if(n_GetDeviceList(h_platform, device_list, n_device_type) != CL_SUCCESS || device_list.empty())
			return size_t(-1);
		// get all the devices

		size_t n_best_device = 0;
		if(device_list.size() > 1) {
			typename CScoringObjective::TObjective t_best_objective = objective(device_list.front());
			for(size_t i = 1, n = device_list.size(); i < n; ++ i) {
				typename CScoringObjective::TObjective t_objective = objective(device_list[i]);
				if(t_best_objective < t_objective) {
					t_best_objective = t_objective;
					n_best_device = i;
				}
			}
		}
		// go trough devices, and score them

		*p_device_id = device_list[n_best_device];
		// write handle to the best device

		return n_best_device;
	}

	/**
	 *	@brief gets id of device with maximum (theoretical) computing
	 *		power or device with more memory
	 *
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *
	 *	@return Returns index of the most powerful device, or -1 on error.
	 *
	 *	@note This is still useful for e.g. getting the more powerful device
	 *		in a multi-device context in some master-slave type computation.
	 */
	static size_t n_Get_MaxGFlops_DeviceId(cl_context h_context);

	/**
	 *	@brief gets id of device with maximum (theoretical) computing power
	 *
	 *	@param[out] p_device_id is pointer where id of the most
	 *		powerful device handle is written upon successful return
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *
	 *	@return Returns index of the most powerful device, or -1 on error.
	 *
	 *	@note This is still useful for e.g. getting the more powerful device
	 *		in a multi-device context in some master-slave type computation.
	 */
	static size_t n_Get_MaxGFlops_DeviceId(cl_device_id *p_device_id, cl_context h_context);

	/**
	 *	@brief gets platform info string
	 *
	 *	@param[out] r_s_str is reference to output string
	 *	@param[in] h_platform is target platform
	 *	@param[in] n_name is requrested value name (e.g. CL_PLATFORM_NAME)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_GetPlatformInfoString(std::string &r_s_str, cl_platform_id h_platform, int n_name);
};

/**
 *	@brief device parameters wrapper
 *
 *	This class is able to read-in device parameters,
 *		and hold them in memory in user-friendly form.
 *
 *	Some of the queries are also available as static functions, which take
 *	a device handle and do not require an instance of this class.
 */
class CCLDeviceParams {
public:
	/**
	 *	@brief device properties structure (CUDA lookalike)
	 *
	 *	@note All the fields are plain int, the field names follow the CUDA terminology.
	 */
	struct CLdevprop {
		int maxThreadsPerBlock;     /**< Maximum number of threads per block */
		int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
		int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
		int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
		int totalConstantMemory;    /**< Constant memory available on device in bytes */
		int SIMDWidth;              /**< Warp size in threads */
		int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
		int regsPerBlock;           /**< 32-bit registers available per block */
		int clockRate;              /**< Clock frequency in kilohertz */
		int textureAlign;           /**< Alignment requirement for textures */
	};

protected:
	cl_device_id m_h_device; /**< @brief device handle */
	std::string m_s_name; /**< @brief device name */
	int m_p_device_caps[2]; /**< @brief device capabilities (presumably the major and minor revision numbers - TODO confirm) */
	int m_n_multiprocessor_num; /**< @brief number of device multiprocessors */
	uint64_t m_n_memory_size; /**< @brief device memory size (presumably in bytes, as returned by n_Memory_Size() - TODO confirm) */
	bool m_b_kernel_exec_timeout; /**< @brief kernel execution timeout flag */
	CLdevprop m_t_devprop; /**< @brief device properties structure */

public:
	/**
	 *	@brief constructor
	 *
	 *	Reads device parameters.
	 *
	 *	@param[in] h_device is device handle
	 *
	 *	@note OpenCL must be initialized before calling this function.
	 *	@note It is recommended to call b_Status() afterwards to see if constructor succeeded.
	 */
	CCLDeviceParams(cl_device_id h_device);

	/**
	 *	@brief constructor
	 *
	 *	Reads device parameters.
	 *
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *	@param[in] n_device_index is zero-based device index (presumably an index to the
	 *		list of the devices in h_context, as returned by CCLUtils::n_GetDeviceList();
	 *		the original comment referred to CUDA's cuDeviceGetCount() - TODO confirm)
	 *
	 *	@note OpenCL must be initialized before calling this function.
	 *	@note It is recommended to call b_Status() afterwards to see if constructor succeeded.
	 */
	CCLDeviceParams(cl_context h_context, size_t n_device_index);

	/**
	 *	@brief prints some basic info about the device
	 *
	 *	@param[in] p_fw is output stream (stdout by default)
	 */
	void Dump(FILE *p_fw = stdout);

	/**
	 *	@brief gets device info string
	 *
	 *	@param[out] r_s_str is reference to output string
	 *	@param[in] n_name is requested value name (e.g. CL_DEVICE_EXTENSIONS)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note There is also a static version of this function, taking a device handle.
	 */
	CLresult n_GetDeviceInfoString(std::string &r_s_str, int n_name);

	/**
	 *	@brief gets device info string
	 *
	 *	@param[out] r_s_str is reference to output string
	 *	@param[in] h_device is target device
	 *	@param[in] n_name is requested value name (e.g. CL_DEVICE_EXTENSIONS)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_GetDeviceInfoString(std::string &r_s_str, cl_device_id h_device, int n_name);

	/**
	 *	@brief determines whether the specified extension is supported by the hardware, or not
	 *
	 *	@param[in] p_s_extension_name is name of extension queried
	 *
	 *	@return Returns true if extension is supported, or false if it's not.
	 *
	 *	@note There is also a static version of this function, taking a device handle.
	 */
	bool b_IsExtensionSupported(const char *p_s_extension_name) const;

	/**
	 *	@brief determines whether the specified extension is supported by the hardware, or not
	 *
	 *	@param[in] h_device is target device
	 *	@param[in] p_s_extension_name is name of extension queried
	 *
	 *	@return Returns true if extension is supported, or false if it's not.
	 */
	static bool b_IsExtensionSupported(cl_device_id h_device, const char *p_s_extension_name);

	/**
	 *	@brief gets device properties structure
	 *
	 *	@param[out] r_t_devprop is reference to output structure
	 *	@param[in] h_device is target device
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_Query_DeviceProperties(CLdevprop &r_t_devprop, cl_device_id h_device);

	/**
	 *	@brief determines whether constructor succeeded
	 *
	 *	In case this function returns false, this object doesn't contain
	 *		valid device parameters, and may not be further used.
	 *
	 *	@return Returns true on success, false on failure.
	 */
	inline bool b_Status() const;

	/**
	 *	@brief gets device handle
	 *	@return Returns device handle.
	 */
	inline cl_device_id h_Device() const;

	/**
	 *	@brief gets device platform
	 *	@param[out] r_h_platform is reference to store device platform id
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	inline CLresult n_GetPlatform(cl_platform_id &r_h_platform) const;

	/**
	 *	@brief determines whether the device is an NVIDIA GPU
	 *	@return Returns true if the device is an NVIDIA GPU, otherwise returns false.
	 *	@note This uses cl_nv_device_attribute_query to determine the vendor.
	 */
	inline bool b_Is_NVIDIA() const;

	/**
	 *	@brief gets device major revision number
	 *	@return Returns device major revision number.
	 *	@note This returns 1 on non-NVIDIA platforms.
	 */
	inline unsigned int n_NV_ComputeCap_Major() const;

	/**
	 *	@brief gets device minor revision number
	 *	@return Returns device minor revision number.
	 *	@note This returns 0 on non-NVIDIA platforms.
	 */
	inline unsigned int n_NV_ComputeCap_Minor() const;

	/**
	 *	@copydoc n_NV_ComputeCap_Major()
	 *	@deprecated This is deprecated in favor of n_NV_ComputeCap_Major().
	 */
	inline unsigned int n_Revision_Major() const;

	/**
	 *	@copydoc n_NV_ComputeCap_Minor()
	 *	@deprecated This is deprecated in favor of n_NV_ComputeCap_Minor().
	 */
	inline unsigned int n_Revision_Minor() const;

	/**
	 *	@brief gets device multiprocessor count
	 *	@return Returns device multiprocessor count.
	 */
	inline size_t n_Multiprocessor_Num() const;

	/**
	 *	@brief gets device memory size
	 *	@return Returns device memory size, in bytes.
	 */
	inline uint64_t n_Memory_Size() const;

	/**
	 *	@brief gets device memory size in MiB
	 *	@return Returns device memory size, in MiB, rounded up.
	 */
	inline size_t n_Memory_Size_MB() const;

	/**
	 *	@brief determines whether device has kernel execution timeout
	 *	@return Returns true if device has kernel execution timeout, otherwise returns false.
	 */
	inline bool b_Has_KernelExecTimeout() const;

	/**
	 *	@brief gets device properties structure
	 *	@return Returns device properties structure.
	 */
	inline const CLdevprop &t_Properties() const;

	/**
	 *	@brief gets device name
	 *	@return Returns device name.
	 */
	inline const std::string &s_Name() const;

	/**
	 *	@brief gets null-terminated string containing device name
	 *	@return Returns null-terminated string containing device name.
	 */
	inline const char *p_s_Name() const;

	/**
	 *	@brief determines whether a problem of given size can be executed in a single kernel call
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *	@return Returns true if problem fits, otherwise returns false (problem needs to be subdivided first).
	 */
	inline bool b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const;

	/**
	 *	@brief determines whether a problem of given size can be executed in a single kernel call,
	 *		and if it can, calculates thread block and grid sizes
	 *	@param[out] p_block_size is pointer where thread block width, height and depth is written
	 *		upon successful return (must be allocated by the caller; presumably
	 *		an array of three elements - TODO confirm)
	 *	@param[out] p_grid_size is pointer where grid width, height and depth is written upon
	 *		successful return (must be allocated by the caller; presumably
	 *		an array of three elements - TODO confirm)
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *	@return Returns true if problem fits, otherwise returns false (problem needs to be subdivided first).
	 */
	inline bool CalculateGridParams(int *p_block_size, int *p_grid_size,
		int n_width, int n_height, int n_depth) const;

protected:
	/**
	 *	@brief fills-in device parameters
	 *	@return Returns true on success, false on failure.
	 */
	bool QueryDeviceParams();
};

/**
 *	@brief simple class for storing OpenCL program binaries in a file
 *
 *	There can be multiple versions of binaries built from the same source
 *	and for the same devices, but with different compiler configuration
 *	(\#defines for example) in a single file. Maximal number of these program
 *	instances is limited, there's least-recently-used policy for determining
 *	wheter to keep or to discard a particular instance.
 */
class CCLProgramStorage {
public:
	enum {
		npos = -1 /**< @brief no-position for n_Find_Program() */
	};

	enum {
		build_Success = 0, /**< @brief n_Get_Program() succeeded */
		build_InvalidParams = -1, /**< @brief invalid number of devices or invalid binary index */
		build_LowMemory = -2, /**< @brief not enough memory */
		build_CreateProgramFailure = -3, /**< @brief clCreateProgramWithBinary() failed */
		build_BuildProgramFailure = -4 /**< @brief clBuildProgram() failed */
	};

	/**
	 *	@brief instance of program, built for (multiple) devices
	 */
	struct TProgramBinary {
		/**
		 *	@brief binary representation of program built for a single device
		 */
		struct TBinary {
			uint64_t n_size; /**< @brief size of binary, in bytes */
			uint8_t *p_data; /**< @brief binary data */

			/**
			 *	@brief default constructor; clears the fields
			 *	@note this structure is allocated and deallocated explicitly
			 */
			inline TBinary();
		};

		std::vector<size_t> binary_size_list; /**< @brief list of binaries lengths for each device (binary_size_list.size() must be equal to TProgram::n_instance_num) */
		TBinary t_data_buffer; /**< @brief concatenated data of all the binaries */

		/**
		 *	@brief downloads program binaries from compiled program to this
		 *
		 *	@param[in] h_program is compiled OpenCL program
		 *	@param[in] n_device_num is number of devices the program was compiled for
		 *
		 *	@return Returns true on success, false on failure.
		 *
		 *	@note This object does not have an implicit destructor, the caller is reponsible
		 *		of disposing of the object by calling Free() once it is no longer needed.
		 */
		bool Download(cl_program h_program, size_t n_device_num);

		/**
		 *	@brief uploads program binaries from this to a compiled program
		 *
		 *	@param[in] h_context is OpenCL context
		 *	@param[out] p_program is pointer to OpenCL program where result is written upon successful return
		 *	@param[out] r_n_result is reference to OpenCL error code (for error checking)
		 *	@param[in] n_device_num is number of devices, program is compiled for
		 *	@param[in] p_device_list is list of devices, program is compiled for (the order must match, it is not checked)
		 *
		 *	@returns build_Success on success, or another build_* on failure.
		 */
		int n_Upload(cl_context h_context, cl_program *p_program, CLresult &r_n_result,
			size_t n_device_num, const cl_device_id *p_device_list) const;

		/**
		 *	@brief disposes of program data
		 *	@note This has no effect if this is empty (can be called multiple times).
		 */
		void Free();

		/**
		 *	@brief swaps binary data with other binary
		 *	@param[in,out] r_t_other is the other binary to swap with
		 */
		inline void Swap(TProgramBinary &r_t_other);
	};

protected:
#pragma pack(1)

	/**
	 *	@brief raw SHA-1 hash (physical layout in a file)
	 */
	struct _TSHA1 {
		uint32_t p_data[5]; /**< @brief hash contents (little endian, first element contains the most significant word) */

		/**
		 *	@brief equality comparator
		 *	@param[in] r_t_other is hash to compare against
		 *	@return Returns true if this hash equals r_t_other, otherwise returns false.
		 */
		inline bool operator ==(const _TSHA1 &r_t_other) const;

		/**
		 *	@brief inequality comparator
		 *	@param[in] r_t_other is hash to compare against
		 *	@return Returns false if this hash equals r_t_other, otherwise returns true.
		 */
		inline bool operator !=(const _TSHA1 &r_t_other) const;
	};

	/**
	 *	@brief program file header (physical layout in a file)
	 */
	struct TFileHeader {
		uint8_t p_magic[4]; /**< @brief magic word (file signature), containing "LCLs" */
		_TSHA1 t_control_hash; /**< @brief hash of the whole file (for checking file integrity) */
		_TSHA1 t_source_hash; /**< @brief hash of program source code */
	};

#pragma pack()

	/**
	 *	@brief instance of program, built for (multiple) devices, with specific build options
	 *
	 *	Extends TProgramBinary with the hash of the build options string.
	 *
	 *	@note The three-argument Download() declared here hides the two-argument
	 *		TProgramBinary::Download() for calls made through TProgramInstance.
	 */
	struct TProgramInstance : public TProgramBinary {
		_TSHA1 t_build_opts_hash; /**< @brief hash of build options (compiler param string) */

		/**
		 *	@brief downloads program binaries from a compiled program, along with the build options hash
		 *
		 *	@param[in] h_program is compiled OpenCL program
		 *	@param[in] n_device_num is number of devices the program was compiled for
		 *	@param[in] t_build_options_hash is hash of the build options string (presumably
		 *		stored in t_build_opts_hash; the definition is not visible here - TODO confirm)
		 *
		 *	@return Presumably returns true on success, false on failure, the same as
		 *		TProgramBinary::Download() (TODO confirm against the definition).
		 */
		bool Download(cl_program h_program, size_t n_device_num, _TSHA1 t_build_options_hash);

		/**
		 *	@brief reads this program instance from a file
		 *
		 *	@param[in] p_fr is the input file stream
		 *	@param[in] n_device_num is number of devices (presumably the number
		 *		of binaries to be read - TODO confirm against the definition)
		 *
		 *	@return Presumably returns true on success, false on failure (TODO confirm).
		 */
		bool Read(FILE *p_fr, size_t n_device_num);

		/**
		 *	@brief writes this program instance to a file
		 *
		 *	@param[in] p_fw is the output file stream
		 *	@param[in] n_device_num is number of devices (presumably the number
		 *		of binaries to be written - TODO confirm against the definition)
		 *
		 *	@return Presumably returns true on success, false on failure (TODO confirm).
		 */
		bool Write(FILE *p_fw, size_t n_device_num);

		/**
		 *	@brief swaps this program instance with another one
		 *	@param[in,out] r_t_other is the other program instance to swap with
		 *	@note NOTE(review): presumably swaps both the binaries and the build
		 *		options hash; the definition is not visible here.
		 */
		inline void Swap(TProgramInstance &r_t_other);
	};

	TFileHeader m_t_header; /**< @brief file header */
	std::vector<std::string> device_id_list; /**< @brief list of device names (those tend to be shorter than their hashes) */
	mutable std::vector<TProgramInstance> instance_list; /**< @brief list of program instances (multiple instances, for different configurations) */

	mutable bool m_b_dirty; /**< @brief dirty flag - the file was modified, and should be resaved */

public:
	/**
	 *	@brief default constructor; initializes an empty program storage
	 */
	CCLProgramStorage();

	/**
	 *	@brief constructor; loads program storage from a file
	 *	@param[in] p_s_filename is path to file containing program binaries
	 *	@note This function may fail, it is therefore recommended to call b_Status() afterwards.
	 */
	CCLProgramStorage(const char *p_s_filename);

	/**
	 *	@brief destructor
	 */
	~CCLProgramStorage();

	/**
	 *	@brief determines constructor success
	 *	@return Returns true if constructor succeeded, otherwise returns false.
	 */
	bool b_Status() const;

	/**
	 *	@brief determines whether data changed
	 *	@return Returns true if data in memory differ from what's in the file, otherwise returns false.
	 *	@note This flag is raised by n_Find_ProgramBinaries() (in some cases) and by Put_ProgramBinaries().
	 *		It is cleared by Load() and Save().
	 */
	bool b_Dirty() const;

	/**
	 *	@brief finds program binaries based on source code and environment
	 *
	 *	@param[in] p_s_source_code is program source code
	 *	@param[in] n_device_num is number of devices, program is compiled for
	 *	@param[in] p_device_list is list of devices, program is compiled for
	 *	@param[in] p_s_build_options is build options string (contains compiler directives)
	 *
	 *	@return Returns index of program binaries, or npos in case no matching binaries were found.
	 *
	 *	@note This invalidates any indices obtained by previous calls to this function.
	 */
	size_t n_Find_ProgramBinaries(const char *p_s_source_code, size_t n_device_num,
		const cl_device_id *p_device_list, const char *p_s_build_options) const;

	/**
	 *	@brief creates OpenCL program from binaries
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to OpenCL program where result is written upon successful return
	 *	@param[out] r_n_result is reference to OpenCL error code (for error checking)
	 *	@param[in] n_binaries_index is index of compiled binaries (obtained by calling n_Find_ProgramBinaries())
	 *	@param[in] n_device_num is number of devices, program is compiled for
	 *	@param[in] p_device_list is list of devices, program is compiled for
	 *
	 *	@return Returns build_Success (0) on success, or one of the other build_* names (negative value) on failure.
	 */
	int n_Get_ProgramBinaries(cl_context h_context, cl_program *p_program, CLresult &r_n_result,
		size_t n_binaries_index, size_t n_device_num, const cl_device_id *p_device_list) const;

	/**
	 *	@brief adds built program binaries to the storage
	 *
	 *	@param[in] h_program is OpenCL program handle
	 *	@param[in] p_s_source_code is program source code
	 *	@param[in] n_device_num is number of devices, program has been compiled for
	 *	@param[in] p_device_list is list of devices, program has been compiled for
	 *	@param[in] p_s_build_options is build options string (contains compiler directives)
	 *
	 *	@return Returns true on success, false on failure (not enough memory).
	 */
	bool Put_ProgramBinaries(cl_program h_program, const char *p_s_source_code,
		size_t n_device_num, const cl_device_id *p_device_list, const char *p_s_build_options);

	/**
	 *	@brief reads binaries from a file
	 *	@param[in] p_s_filename is path to file containing program binaries
	 *	@return Returns true on success, false on failure.
	 */
	bool Load(const char *p_s_filename);

	/**
	 *	@brief saves binaries in the storage to a file
	 *
	 *	@param[in] p_s_filename is output file name
	 *	@param[in] n_max_program_instance_num is limit of stored versions of program binaries
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Save(const char *p_s_filename, size_t n_max_program_instance_num = 32);

protected:
	/**
	 *	@brief calculates hash of a string
	 *	@param[in] p_s_str is the string to be hashed (presumably null-terminated - TODO confirm)
	 *	@return Returns the raw hash of the string.
	 */
	static _TSHA1 t_Hash_String(const char *p_s_str);

	/**
	 *	@brief disposes of a program instance
	 *	@param[in,out] r_t_inst is the program instance to be deleted
	 *	@note NOTE(review): presumably calls r_t_inst.Free(), as TProgramBinary
	 *		needs to be freed explicitly; the definition is not visible here.
	 */
	static inline void DeleteInstance(TProgramInstance &r_t_inst);

	/**
	 *	@brief calculates the control hash (see TFileHeader::t_control_hash)
	 *
	 *	@param[out] r_t_result is reference to store the calculated hash
	 *	@param[in] n_instance_limit is limit on the number of program instances (SIZE_MAX by
	 *		default; presumably the number of instances included in the hash, matching
	 *		n_max_program_instance_num of Save() - TODO confirm against the definition)
	 *
	 *	@return Presumably returns true on success, false on failure (TODO confirm).
	 */
	bool Calc_ControlHash(_TSHA1 &r_t_result, size_t n_instance_limit = SIZE_MAX) const;

	/**
	 *	@brief checks the given devices against this storage
	 *
	 *	@param[in] n_device_num is number of devices in the list
	 *	@param[in] p_device_list is list of devices
	 *
	 *	@return NOTE(review): presumably returns true if the devices match device_id_list,
	 *		otherwise returns false; the definition is not visible here.
	 */
	bool b_Check_DeviceIds(size_t n_device_num, const cl_device_id *p_device_list) const;
};

/**
 *	@brief utility class for calling OpenCL compiler and caching of compiled programs in a file
 */
class CCLProgramCompiler {
public:
	/**
	 *	@brief names of the individual status word bits
	 *
	 *	Each name occupies a single distinct bit of the status word. The names
	 *	are grouped by the phase they describe (cache read, build, cache write).
	 */
	enum {
		// reading the cache file
		cache_ReadAttempted = 1 << 0, /**< reading of program binaries from cache file was attempted */
		cache_ReadSucceeded = 1 << 1, /**< reading of program binaries from cache file was successful */
		cache_ReadFailed_FileNotFound = 1 << 2, /**< reading of program binaries failed because specified file couldn't be opened */
		cache_ReadFailed_IO = 1 << 3, /**< reading of program binaries failed because of i/o error */
		cache_ReadFailed_OutOfMemory = 1 << 4, /**< reading of program binaries failed because of memory allocation error */
		cache_ReadFailed_SourceChecksum = 1 << 5, /**< reading of program binaries failed because source code is different (cache miss) */
		cache_ReadFailed_BinaryChecksum = 1 << 6, /**< reading of program binaries failed because it's corrupt */
		cache_ReadFailed_CreateProgram = 1 << 7, /**< reading of program binaries failed because clCreateProgramWithBinary() failed (wrong device?) */

		// building from the source code
		prog_CompiledFromSource = 1 << 8, /**< program needed to be compiled from source code at runtime */
		prog_CreateSucceeded = 1 << 9, /**< program was compiled, clCreateProgramWithSource() succeeded */
		prog_BuildSucceeded = 1 << 10, /**< program was compiled, clBuildProgram() succeeded */

		// writing the cache file
		cache_WriteAttempted = 1 << 11, /**< attempted to write cache file with program binaries */
		cache_WriteSucceeded = 1 << 12, /**< writing program binaries was successful */
		cache_WriteTrimmed = 1 << 13 /**< not all program versions fit in the file (hint to increase n_max_program_instance_num) */
	};

	/**
	 *	@brief prints human-readable information from status word to specified stream
	 *
	 *	@param[in] n_status_word is status word, returned in last
	 *		parameter of n_CompileProgram() or n_CompileProgramFile()
	 *	@param[in] p_fw is output stream (stdout by default)
	 */
	static void Dump_StatusWord(int n_status_word, FILE *p_fw = stdout);

	/**
	 *	@brief compiles OpenCL "C" program, while allowing for its binary to be cached
	 *
	 *	In case program is compiled from source code and build fails, build log is printed
	 *	for each device to stderr (CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS must be defined).
	 *	In case CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS is defined, build log
	 *	is displayed even if build succeeds (only if it's not empty).
	 *
	 *	The p_s_cache_file argument supports simple path substitutions. In case it starts
	 *	with "%temp_folder%", it is substituted to the name of the temp folder, not ending with
	 *	slash. In case it starts with "%temp_default%", it is substituted with name of the temp
	 *	folder, module name, and suffixed with ".clbin". Use as follows:
	 *
	 *	"%temp_folder%/MyApp_MyKernel.clbin" substitutes into: "/tmp/MyApp_MyKernel.clbin"
	 *
	 *	Or alternatively:
	 *
	 *	"%temp_default%MyKernel" substitutes into "/tmp/MyApp_MyKernel.clbin", where MyApp is
	 *	the current module name. This is intended for use in libraries where the module name
	 *	that will use the library in question is not known yet.
	 *
	 *	The purpose of these substitutions is to make it easier to store the cached binaries
	 *	in the temporary folder instead of in the current working directory where the
	 *	application may not have write permission (if installed e.g. in /bin/ or in Program
	 *	Files). Note that NVIDIA compiler also internally caches the binaries in temp folder.
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] p_s_source is program source code
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_build_options is string with OpenCL build options, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_cache_file is either null, or filename where compiled
	 *		binary (along with source checksum) should be found/stored
	 *	@param[out] p_status_word is pointer to integer where status word should be written, or NULL
	 *	@param[in] n_max_program_instance_num is maximal number of program instances saved to
	 *		a file (see CCLProgramStorage documentation); note if CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	 *		is defined, this value is forced to 1
	 *
	 *	@return Returns CL_SUCCESS on success, or other OpenCL error code on failure. The function
	 *		succeeds regardless of whether program failed to load from file, or whether binaries
	 *		couldn't be written to cache file.
	 *
	 *	@note Without specifying at least one device in the list, caching feature
	 *		will not be able to save/load the program.
	 *	@note Without specifying at least one device in the list, build log won't be displayed on build error.
	 */
	static CLresult n_CompileProgram(cl_context h_context, cl_program *p_program,
		const char *p_s_source, size_t n_device_num = 0, const cl_device_id *p_device_list = 0,
		const char *p_s_build_options = "", const char *p_s_cache_file = 0, int *p_status_word = 0,
		size_t n_max_program_instance_num = 32);

	/**
	 *	@brief compiles OpenCL "C" program from file, while allowing for its binary to be cached
	 *
	 *	In case program is compiled from source code and build fails, build log is printed
	 *	for each device to stderr (CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS must be defined).
	 *	In case CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS is defined, build log
	 *	is displayed even if build succeeds (only if it's not empty).
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] p_s_source_file is name of file with program source code
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_build_options is string with OpenCL build options, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_cache_file is either null, or filename where compiled
	 *		binary (along with source checksum) should be found/stored
	 *	@param[out] p_status_word is pointer to integer where status word should be written, or NULL
	 *	@param[in] n_max_program_instance_num is maximal number of program instances saved to
	 *		a file (see CCLProgramStorage documentation); note if CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	 *		is defined, this value is forced to 1
	 *
	 *	@return Returns CL_SUCCESS on success, or other OpenCL error code on failure. The function
	 *		succeeds regardless of whether program failed to load from file, or whether binaries
	 *		couldn't be written to cache file.
	 *
	 *	@note Without specifying at least one device in the list, caching feature
	 *		will not be able to save/load the program.
	 *	@note Without specifying at least one device in the list, build log won't be displayed on build error.
	 */
	static CLresult n_CompileProgramFile(cl_context h_context, cl_program *p_program,
		const char *p_s_source_file, size_t n_device_num = 0, const cl_device_id *p_device_list = NULL,
		const char *p_s_build_options = "", const char *p_s_cache_file = 0, int *p_status_word = 0,
		size_t n_max_program_instance_num = 32);

	/**
	 *	@brief downloads binaries of a program, passes them to a postprocessing
	 *		function object and creates a new program from the result
	 *
	 *	@tparam CPostprocess is type of the postprocessing function object; it is called
	 *		with the downloaded CCLProgramStorage::TProgramBinary as the only argument
	 *		and its result is tested for being true (success) or false (failure)
	 *
	 *	@param[in] h_context is OpenCL context, passed to TProgramBinary::n_Upload()
	 *	@param[out] p_program is pointer to program handle, passed to TProgramBinary::n_Upload()
	 *	@param[in] h_program is the program to download the binaries from
	 *	@param[in] t_postprocess_object is the postprocessing function object
	 *
	 *	@return Returns true on success, false on failure (an OpenCL query failed, the
	 *		program has no devices, the download failed, the postprocessing failed, the
	 *		upload did not return build_Success or std::bad_alloc was thrown).
	 *
	 *	@note The OpenCL error code of the upload is discarded.
	 */
	template <class CPostprocess>
	static bool Postprocess_ProgramBinary(cl_context h_context, cl_program *p_program,
		cl_program h_program, CPostprocess t_postprocess_object)
	{
		/**
		 *	@brief calls Free() on the guarded binary when leaving the scope
		 *	@note TProgramBinary has no implicit destructor and Free() is documented
		 *		to have no effect on an empty object, so it is safe on all the paths.
		 */
		struct TBinaryGuard {
			CCLProgramStorage::TProgramBinary *m_p_binary; /**< @brief the guarded binary */

			inline TBinaryGuard(CCLProgramStorage::TProgramBinary *p_binary)
				:m_p_binary(p_binary)
			{}

			inline ~TBinaryGuard()
			{
				m_p_binary->Free();
			}
		};

		try {
			cl_uint n_device_num = 0;
			if(clGetProgramInfo(h_program, CL_PROGRAM_NUM_DEVICES,
			   sizeof(cl_uint), &n_device_num, NULL) != CL_SUCCESS)
				return false;
			// get number of devices (the size passed must be that of the
			// cl_uint buffer, not sizeof(size_t) which is larger on 64-bit targets)

			if(!n_device_num)
				return false;
			// nothing to do; also avoids taking address of the first element of an empty vector below

			std::vector<cl_device_id> device_list(n_device_num);
			if(clGetProgramInfo(h_program, CL_PROGRAM_DEVICES, n_device_num * sizeof(cl_device_id),
			   &device_list[0], NULL) != CL_SUCCESS)
				return false;
			// get the devices

			CCLProgramStorage::TProgramBinary b;
			TBinaryGuard guard(&b);
			// make sure the binaries are freed on every return path and also if an exception is thrown

			if(!b.Download(h_program, n_device_num))
				return false;
			// download

			if(!t_postprocess_object(b))
				return false;
			// postprocess to get new binaries (todo - figure out how)

			CLresult n_result; // silently ignored
			return b.n_Upload(h_context, p_program, n_result,
				n_device_num, &device_list[0]) == CCLProgramStorage::build_Success;
		} catch(std::bad_alloc&) {
			return false;
		}
	}

	/**
	 *	@brief gets build status and build log for specified program and device
	 *
	 *	@param[out] r_s_build_log is string where build log is written upon successful return
	 *	@param[out] r_n_build_status is reference to build status, which is written upon successful return
	 *	@param[in] h_program is program in question
	 *	@param[in] h_device is (one of) target device(s) for which program was built
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static CLresult n_Get_BuildLog(std::string &r_s_build_log, cl_build_status &r_n_build_status,
		cl_program h_program, cl_device_id h_device);

#ifdef CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	/**
	 *	@brief calculates SHA1 hash of source code and build options strings
	 *
	 *	@param[in] p_s_source is OpenCL "C" source code
	 *	@param[in] p_s_build_options is OpenCL compiler options string (may be NULL)
	 *
	 *	@return Returns SHA1 of both strings.
	 */
	static TSHA1 t_Hash_ProgramSource_BuildOptions(const char *p_s_source, const char *p_s_build_options = 0);

	/**
	 *	@brief reads program binaries and creates cl_program
	 *
	 *	@param[in] p_s_filename is file, containing binaries (saved using WriteProgramBinaries())
	 *	@param[in] t_hash is hash of source code and build options (obtained using t_Hash_ProgramSource_BuildOptions())
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *
	 *	@return Returns cache_ReadSucceeded on success, or one of cache_ReadFailed_* on failure.
	 */
	static int n_ReadProgramBinaries(const char *p_s_filename, TSHA1 t_hash, cl_context h_context,
		cl_program *p_program, size_t n_device_num, const cl_device_id *p_device_list);

	/**
	 *	@brief writes binaries of cl_program to a file
	 *
	 *	@param[in] h_program is handle to an existing OpenCL program
	 *	@param[in] t_hash is hash of program source code and build options (obtained using t_Hash_ProgramSource_BuildOptions())
	 *	@param[in] p_s_filename is output file name
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool WriteProgramBinaries(cl_program h_program, TSHA1 t_hash, const char *p_s_filename);
#endif // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
};

/**
 *	@brief unique OpenCL handle
 *
 *	@tparam CCLHandle is handle type
 *	@tparam CInterface is interface object with member m_h_handle and a static Destroy(CCLHandle) function
 *
 *	@note The copy-constructor and the copy operator take a non-const reference,
 *		as the object being copied from loses ownership of its handle.
 */
template <class CCLHandle, class CInterface>
class CCLUniqueWrapper : public CInterface {
public:
	typedef CCLHandle TCLHandle; /**< @brief handle type */

public:
	/**
	 *	@brief default constructor; sets handle to a specified value
	 *	@param[in] h_handle is OpenCL handle to become managed (default null handle)
	 *	@note The specified handle will be released automatically.
	 */
	explicit inline CCLUniqueWrapper(CCLHandle h_handle = 0);

	/**
	 *	@brief copy-constructor; takes over the handle of the other object
	 *	@param[in,out] r_other is unique handle to copy from (it will lose ownership of the handle)
	 */
	inline CCLUniqueWrapper(/*const*/ CCLUniqueWrapper &r_other);

	/**
	 *	@brief destructor; destroys the currently owned handle, if any
	 */
	inline ~CCLUniqueWrapper();

	/**
	 *	@brief copy operator; destroys the currently owned handle, if any
	 *	@param[in,out] r_other is unique handle to copy from (it will lose ownership of the handle)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueWrapper &operator =(/*const*/ CCLUniqueWrapper &r_other);

	/**
	 *	@brief handle assignment operator; destroys the currently owned handle, if any
	 *	@param[in] h_handle is handle to assign (will be owned by this from now on)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueWrapper &operator =(CCLHandle h_handle);

	/**
	 *	@brief destroys the currently owned handle, if any
	 */
	inline void Destroy();

	/**
	 *	@brief yields ownership of the handle, which is to be no longer managed
	 *	@return Returns currently owned handle. This object owns no handle upon return.
	 *	@note This would be usually called "Release" but OpenCL uses Release to
	 *		destroy objects and the meaning would be misleading.
	 */
	inline CCLHandle h_YieldOwnership();

	/**
	 *	@brief gets the handle for use in OpenCL functions
	 *	@return Returns the currently owned handle (may be a null handle).
	 *	@note This does not yield ownership of the handle which stays managed.
	 */
	inline CCLHandle h_Get() const;

	/**
	 *	@brief swaps two unique handles
	 *	@param[in,out] r_other is handle to swap with
	 */
	inline void Swap(CCLUniqueWrapper &r_other);
};

/**
 *	@brief extended unique OpenCL handle
 *
 *	On top of CCLUniqueWrapper, this declares an implicit conversion to the
 *	handle type and a const address-of operator returning pointer to the handle.
 *
 *	@tparam CCLHandle is handle type
 *	@tparam CInterface is interface object with member m_h_handle and a static Destroy(CCLHandle) function
 */
template <class CCLHandle, class CInterface>
class CCLUniqueWrapperEx : public CCLUniqueWrapper<CCLHandle, CInterface> {
public:
	/**
	 *	@brief default constructor; sets handle to a specified value
	 *	@param[in] h_handle is OpenCL handle to become managed (default null handle)
	 *	@note The specified handle will be released automatically.
	 */
	explicit inline CCLUniqueWrapperEx(CCLHandle h_handle = 0);

	/**
	 *	@brief copy-constructor; takes over the handle of the other object
	 *	@param[in,out] r_other is unique handle to copy from (it will lose ownership of the handle)
	 */
	inline CCLUniqueWrapperEx(/*const*/ CCLUniqueWrapperEx &r_other);

	/**
	 *	@brief copy operator; destroys the currently owned handle, if any
	 *	@param[in,out] r_other is unique handle to copy from (it will lose ownership of the handle)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueWrapperEx &operator =(/*const*/ CCLUniqueWrapperEx &r_other);

	/**
	 *	@brief handle assignment operator; destroys the currently owned handle, if any
	 *	@param[in] h_handle is handle to assign (will be owned by this from now on)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueWrapperEx &operator =(CCLHandle h_handle);

	/**
	 *	@brief conversion to handle for use in OpenCL functions
	 *	@return Returns the currently owned handle (may be a null handle).
	 *	@note This does not yield ownership of the handle which stays managed.
	 */
	inline operator CCLHandle() const;

	/**
	 *	@brief conversion to const handle pointer for use in OpenCL functions
	 *	@return Returns pointer to the currently owned handle (may be
	 *		a null handle - but not a null pointer).
	 *	@note This does not yield ownership of the handle which stays managed.
	 *	@note This overloads the address-of operator; applying it to a const
	 *		object yields pointer to the handle rather than to the object.
	 */
	inline const CCLHandle *operator &() const;

#if 0
	/**
	 *	@brief conversion to handle pointer for use in OpenCL functions
	 *	@return Returns pointer to the currently owned handle (may be
	 *		a null handle - but not a null pointer).
	 *	@note This does not yield ownership of the handle which stays managed.
	 */
	//inline CCLHandle *operator &(); // not permitted, as it would not destroy the owned object correctly
#endif // 0

	/**
	 *	@brief swaps two unique handles
	 *	@param[in,out] r_other is handle to swap with
	 */
	inline void Swap(CCLUniqueWrapperEx &r_other);
};

/**
 *	@brief destructor for CCLUniqueEvent
 *
 *	Serves as the CInterface argument of the unique wrapper templates; it provides
 *	the m_h_handle member and the static Destroy() function they require.
 */
class CCLEventDestructor {
protected:
	cl_event m_h_handle; /**< @brief managed OpenCL handle */

public:
	/**
	 *	@brief destroys the given handle
	 *	@param[in] h_event is handle to the object to be destroyed
	 */
	static inline void Destroy(cl_event h_event);
};

/**
 *	@brief destructor for CCLUniqueKernel
 *
 *	Serves as the CInterface argument of the unique wrapper templates; it provides
 *	the m_h_handle member and the static Destroy() function they require.
 */
class CCLKernelDestructor {
protected:
	cl_kernel m_h_handle; /**< @brief managed OpenCL handle */

public:
	/**
	 *	@brief destroys the given handle
	 *	@param[in] h_kernel is handle to the object to be destroyed
	 */
	static inline void Destroy(cl_kernel h_kernel);
};

/**
 *	@brief destructor for CCLUniqueMem
 *
 *	Serves as the CInterface argument of the unique wrapper templates; it provides
 *	the m_h_handle member and the static Destroy() function they require.
 */
class CCLMemObjectDestructor {
protected:
	cl_mem m_h_handle; /**< @brief managed OpenCL handle */

public:
	/**
	 *	@brief destroys the given handle
	 *	@param[in] h_mem_object is handle to the object to be destroyed
	 */
	static inline void Destroy(cl_mem h_mem_object);
};

/**
 *	@brief unique OpenCL event handle
 */
typedef CCLUniqueWrapperEx<cl_event, CCLEventDestructor> CCLUniqueEvent;

/**
 *	@brief unique OpenCL kernel handle
 */
typedef CCLUniqueWrapperEx<cl_kernel, CCLKernelDestructor> CCLUniqueKernel;

/**
 *	@brief unique OpenCL memory object handle
 */
typedef CCLUniqueWrapperEx<cl_mem, CCLMemObjectDestructor> CCLUniqueMem;

/**
 *	@brief utility functions on OpenCL context
 *
 *	Serves as the CInterface argument of CCLUniqueWrapper (see CCLUniqueContext); it
 *	provides the m_h_handle member and the static Destroy() function the wrapper requires.
 */
class CCLContextInterface {
protected:
	cl_context m_h_handle; /**< @brief OpenCL context */

public:
	/**
	 *	@brief destroys the given handle
	 *	@param[in] h_context is handle to the object to be destroyed
	 */
	static inline void Destroy(cl_context h_context);

	/**
	 *	@brief allocates OpenCL memory buffer in this context
	 *
	 *	@param[out] r_h_mem is reference to store the handle to the new memory object
	 *	@param[in] n_size is size of the buffer, in bytes
	 *	@param[in] n_flags is bitfield with memory flags (default CL_MEM_READ_WRITE)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	inline CLresult n_CreateBuffer(cl_mem &r_h_mem, size_t n_size, cl_mem_flags n_flags = CL_MEM_READ_WRITE) const;

	/**
	 *	@brief allocates OpenCL memory buffer in this context, with a host pointer
	 *
	 *	@param[out] r_h_mem is reference to store the handle to the new memory object
	 *	@param[in] n_size is size of the buffer, in bytes
	 *	@param[in] p_host_pointer is pointer to the buffer data already initialized by the caller
	 *	@param[in] n_flags is bitfield with memory flags (e.g. CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	inline CLresult n_CreateBuffer(cl_mem &r_h_mem, size_t n_size, void *p_host_pointer, cl_mem_flags n_flags) const;

	/**
	 *	@brief allocates OpenCL memory buffer in this context
	 *
	 *	@param[in] n_size is size of the buffer, in bytes
	 *	@param[in] n_flags is bitfield with memory flags (default CL_MEM_READ_WRITE)
	 *
	 *	@return Returns handle to the new memory object on success, null handle on failure.
	 *
	 *	@note Use the managed CCLUniqueMem instead of cl_mem.
	 */
	inline cl_mem h_CreateBuffer(size_t n_size, cl_mem_flags n_flags = CL_MEM_READ_WRITE) const;

	/**
	 *	@brief allocates OpenCL memory buffer in this context, with a host pointer
	 *
	 *	@param[in] n_size is size of the buffer, in bytes
	 *	@param[in] p_host_pointer is pointer to the buffer data already initialized by the caller
	 *	@param[in] n_flags is bitfield with memory flags (e.g. CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR)
	 *
	 *	@return Returns handle to the new memory object on success, null handle on failure.
	 *
	 *	@note Use the managed CCLUniqueMem instead of cl_mem.
	 */
	inline cl_mem h_CreateBuffer(size_t n_size, void *p_host_pointer, cl_mem_flags n_flags) const;
};

/**
 *	@brief unique OpenCL context handle
 */
typedef CCLUniqueWrapper<cl_context, CCLContextInterface> CCLUniqueContext;

/**
 *	@brief utility functions on OpenCL command queue
 */
class CCLCommandQueueInterface {
	friend class CCLKernelCall; // uses private h_Handle()

protected:
	cl_command_queue m_h_handle; /**< @brief OpenCL command queue */

public:
	/**
	 *	@brief destroys the given handle
	 *	@param[in] h_cmd_queue is handle to the command queue to be destroyed
	 */
	static inline void Destroy(cl_command_queue h_cmd_queue);

	/**
	 *	@brief enqueues read buffer operation
	 *
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] b_blocking is blocking flag (if set, the call does not return until the transfer finishes)
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_ReadBuffer(cl_mem h_device_src, bool b_blocking, size_t n_offset,
		size_t n_size, void *p_host_dest, cl_uint n_wait_for_event_num = 0,
		const cl_event *p_wait_for_event = 0, cl_event *p_finished_event = 0) const;

	/**
	 *	@brief enqueues read buffer operation
	 *
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] b_blocking is blocking flag (if set, the call does not return until the transfer finishes)
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_ReadBuffer(cl_mem h_device_src, bool b_blocking, size_t n_offset,
		size_t n_size, void *p_host_dest, cl_uint n_wait_for_event_num,
		const cl_event *p_wait_for_event, CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief enqueues write buffer operation
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] b_blocking is blocking flag (if set, the call does not return until the transfer finishes)
	 *	@param[in] n_offset is offset to the destination memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_WriteBuffer(cl_mem h_device_dest, bool b_blocking, size_t n_offset,
		size_t n_size, const void *p_host_src, cl_uint n_wait_for_event_num = 0,
		const cl_event *p_wait_for_event = 0, cl_event *p_finished_event = 0) const;

	/**
	 *	@brief enqueues write buffer operation
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] b_blocking is blocking flag (if set, the call does not return until the transfer finishes)
	 *	@param[in] n_offset is offset to the destination memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_WriteBuffer(cl_mem h_device_dest, bool b_blocking, size_t n_offset,
		size_t n_size, const void *p_host_src, cl_uint n_wait_for_event_num,
		const cl_event *p_wait_for_event, CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief enqueues copy buffer operation
	 *
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_src_offset is offset to the source memory object, in bytes
	 *	@param[in] n_dest_offset is offset to the destination memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_CopyBuffer(cl_mem h_device_src, cl_mem h_device_dest,
		size_t n_src_offset, size_t n_dest_offset, size_t n_size,
		cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief enqueues copy buffer operation
	 *
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_src_offset is offset to the source memory object, in bytes
	 *	@param[in] n_dest_offset is offset to the destination memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_CopyBuffer(cl_mem h_device_src, cl_mem h_device_dest,
		size_t n_src_offset, size_t n_dest_offset, size_t n_size,
		cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief copies data from device to host (enqueues read buffer operation) and waits for the operation to finish
	 *
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoH(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
		size_t n_size, cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief copies data from device to host (enqueues read buffer operation) and waits for the operation to finish
	 *
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoH(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
		size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief copies data from device to host (enqueues read buffer operation) and returns immediately
	 *
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoH_Async(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
		size_t n_size, cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief copies data from device to host (enqueues read buffer operation) and returns immediately
	 *
	 *	@param[out] p_host_dest is pointer to the destination buffer in host memory
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_offset is offset to the source memory object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoH_Async(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
		size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief copies data from host to device (enqueues write buffer operation) and waits for the operation to finish
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_offset is offset to the destination object, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_HtoD(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
		size_t n_size, cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief copies data from host to device (enqueues write buffer operation) and waits for the operation to finish
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_offset is offset to the destination object, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_HtoD(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
		size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief copies data from host to device (enqueues write buffer operation) and returns immediately
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_offset is offset to the destination object, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_HtoD_Async(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
		size_t n_size, cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief copies data from host to device (enqueues write buffer operation) and returns immediately
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_offset is offset to the destination object, in bytes
	 *	@param[in] p_host_src is pointer to the source buffer in host memory
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_HtoD_Async(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
		size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief copies data from device to device (enqueues copy buffer operation) and waits for the operation to finish
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_dest_offset is offset to the destination object, in bytes
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_src_offset is offset to the source object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoD(cl_mem h_device_dest, size_t n_dest_offset,
		cl_mem h_device_src, size_t n_src_offset, size_t n_size,
		cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief copies data from device to device (enqueues copy buffer operation) and waits for the operation to finish
	 *
	 *	@param[in] h_device_dest is handle of the destination memory object
	 *	@param[in] n_dest_offset is offset to the destination object, in bytes
	 *	@param[in] h_device_src is handle of the source memory object
	 *	@param[in] n_src_offset is offset to the source object, in bytes
	 *	@param[in] n_size is amount of data to be copied, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Memcpy_DtoD(cl_mem h_device_dest, size_t n_dest_offset,
		cl_mem h_device_src, size_t n_src_offset, size_t n_size,
		cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief enqueues buffer mapping operation
	 *
	 *	@param[out] r_p_buffer is reference to store the pointer to the mapped data
	 *	@param[in] h_buffer is handle of the memory object to be mapped
	 *	@param[in] b_blocking_map is blocking flag (if set, the call blocks, otherwise returns immediately)
	 *	@param[in] n_map_flags is bitfield with mapping flags (e.g. CL_MAP_READ and/or CL_MAP_WRITE)
	 *	@param[in] n_offset is offset to the memory object, in bytes
	 *	@param[in] n_size is amount of data to be mapped, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note The buffer must be unmapped before it can be used by a kernel,
	 *		see n_Enqueue_UnmapMemObject().
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_MapBuffer(void *&r_p_buffer, cl_mem h_buffer, bool b_blocking_map, 
 		cl_map_flags n_map_flags, size_t n_offset, size_t n_size,
		cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief enqueues buffer mapping operation
	 *
	 *	@param[out] r_p_buffer is reference to store the pointer to the mapped data
	 *	@param[in] h_buffer is handle of the memory object to be mapped
	 *	@param[in] b_blocking_map is blocking flag (if set, the call blocks, otherwise returns immediately)
	 *	@param[in] n_map_flags is bitfield with mapping flags (e.g. CL_MAP_READ and/or CL_MAP_WRITE)
	 *	@param[in] n_offset is offset to the memory object, in bytes
	 *	@param[in] n_size is amount of data to be mapped, in bytes
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note The buffer must be unmapped before it can be used by a kernel,
	 *		see n_Enqueue_UnmapMemObject().
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_MapBuffer(void *&r_p_buffer, cl_mem h_buffer, bool b_blocking_map, 
 		cl_map_flags n_map_flags, size_t n_offset, size_t n_size,
		cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief compares OpenCL buffer contents to an element range
	 *
	 *	@tparam CIterator is host values iterator type
	 *
	 *	@param[out] r_b_comparison_result is comparison result (true if the contents
	 *		of the buffer compare equal to the specified elements, otherwise false)
	 *	@param[in] p_begin_it is iterator pointing to the first element to be compared (in host memory)
	 *	@param[in] p_end_it is iterator pointing to one past the last element to be compared (in host memory)
	 *	@param[in] h_buffer is handle of OpenCL buffer
	 *	@param[in] n_offset is offset to the OpenCL buffer, in bytes
	 *	@param[in] b_allow_fallback is fallback flag (if set and the buffer map operation fails, a simple
	 *		copy of the requested region is attempted; if not set (default), the function fails)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note This assumes that the type of the buffer elements is the same as the type the iterators point to.
	 *	@note This function blocks and causes CPU / GPU synchronization (it is intended primarily for debugging).
	 *	@note This function uses clEnqueueMapBuffer().
	 */
	template <class CIterator>
	inline CLresult n_CompareBuffer(bool &r_b_comparison_result, CIterator p_begin_it,
		CIterator p_end_it, cl_mem h_buffer, size_t n_offset = 0, bool b_allow_fallback = false) const
	{
		r_b_comparison_result = false;
		// pessimistic default; stays false on every error path

		_ASSERTE(p_begin_it <= p_end_it);
		typedef typename std::iterator_traits<CIterator>::value_type TValue;
		const size_t n_elem_num = size_t(p_end_it - p_begin_it);
		const size_t n_size_bytes = n_elem_num * sizeof(TValue);
		// the length of the range is needed several times, calculate it only once

		void *p_buffer_ptr = 0;
		CLresult n_result;
		if((n_result = n_Enqueue_MapBuffer(p_buffer_ptr, h_buffer, true,
		   CL_MAP_READ, n_offset, n_size_bytes)) != CL_SUCCESS) {
			if(!b_allow_fallback)
				return n_result;
			// failed to map and the caller did not allow the fallback

			try {
				std::vector<TValue> values(n_elem_num);
				if((n_result = n_Enqueue_Memcpy_DtoH((values.empty())? 0 : &values[0], h_buffer,
				   n_offset, n_size_bytes, 0, 0, 0)) != CL_SUCCESS)
					return n_result;
				// if failed to map, fall back to an ordinary (blocking) copy to a temporary
				// buffer; note that &values[0] must not be evaluated on an empty vector

				r_b_comparison_result = std::equal(p_begin_it, p_end_it, values.begin());

				return cl_Success;
			} catch(std::bad_alloc&) {
				return cl_Out_Of_Host_Memory; // not enough memory for the temporary buffer
			}
		}
		// map the buffer for reading (blocking), or use the fallback path

		try {
			const TValue *p_buffer = (const TValue*)p_buffer_ptr;
			r_b_comparison_result = std::equal(p_buffer, p_buffer + n_elem_num, p_begin_it);
			// the second iterator (the last parameter) may need to be a checked iterator, the pointer sure isn't
		} catch(...) {
			n_Enqueue_UnmapMemObject(p_buffer_ptr, h_buffer); // should use RAII (todo - wrap a buffer mapping?); ignore errors here
			throw; // rethrow the original exception ("throw r_exc;" would throw a sliced copy)
		}
		// compare; the comparison may throw, make sure that the buffer gets unmapped in any case

		return n_Enqueue_UnmapMemObject(p_buffer_ptr, h_buffer);
	}

	/**
	 *	@brief compares OpenCL buffer contents to an element range, with conversion
	 *
	 *	Example use:
	 *	@code
	 *	struct MyPred : public std::binary_function<size_t, int, bool> {
	 *		inline bool operator ()(size_t a, int b) const // a is the host value, b is the device value
	 *		{
	 *			return b >= 0 && size_t(b) == a;
	 *		}
	 *	};
	 *
	 *	cl_mem device_data; // type int
	 *	std::vector<size_t> CPU_data; // type size_t
	 *
	 *	bool b_equal;
	 *	n_CompareBuffer(b_equal, CPU_data.begin(), CPU_data.end(), device_data, 0, MyPred());
	 *	@endcode
	 *
	 *	@tparam CIterator is host values iterator type
	 *	@tparam CComparePredicate is comparison predicate (must have public member
	 *		<tt>second_argument_type</tt>, which gives the type of the device data,
	 *		e.g. by inheriting from std::binary_function)
	 *
	 *	@param[out] r_b_comparison_result is comparison result (true if the contents
	 *		of the buffer compare equal to the specified elements, otherwise false)
	 *	@param[in] p_begin_it is iterator pointing to the first element to be compared (in host memory)
	 *	@param[in] p_end_it is iterator pointing to one past the last element to be compared (in host memory)
	 *	@param[in] h_buffer is handle of OpenCL buffer
	 *	@param[in] n_offset is offset to the OpenCL buffer, in bytes
	 *	@param[in] pred is instance of the comparison predicate
	 *	@param[in] b_allow_fallback is fallback flag (if set and the buffer map operation fails, a simple
	 *		copy of the requested region is attempted; if not set (default), the function fails)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note This assumes that the type of the buffer elements is <tt>CComparePredicate::second_argument_type</tt>.
	 *	@note This function blocks and causes CPU / GPU synchronization (it is intended primarily for debugging).
	 *	@note This function uses clEnqueueMapBuffer().
	 */
	template <class CIterator, class CComparePredicate>
	inline CLresult n_CompareBuffer(bool &r_b_comparison_result, CIterator p_begin_it,
		CIterator p_end_it, cl_mem h_buffer, size_t n_offset,
		CComparePredicate pred, bool b_allow_fallback = false) const
	{
		r_b_comparison_result = false;
		// pessimistic default; stays false on every error path

		_ASSERTE(p_begin_it <= p_end_it);
		typedef typename CComparePredicate::second_argument_type TDeviceValue; // elements on device
		// the predicate is invoked as pred(host_value, device_value)

		const size_t n_elem_num = size_t(p_end_it - p_begin_it);
		const size_t n_size_bytes = n_elem_num * sizeof(TDeviceValue);
		// the length of the range is needed several times, calculate it only once

		void *p_buffer_ptr = 0;
		CLresult n_result;
		if((n_result = n_Enqueue_MapBuffer(p_buffer_ptr, h_buffer, true,
		   CL_MAP_READ, n_offset, n_size_bytes)) != CL_SUCCESS) {
			if(!b_allow_fallback)
				return n_result;
			// failed to map and the caller did not allow the fallback

			try {
				std::vector<TDeviceValue> values(n_elem_num);
				if((n_result = n_Enqueue_Memcpy_DtoH((values.empty())? 0 : &values[0], h_buffer,
				   n_offset, n_size_bytes, 0, 0, 0)) != CL_SUCCESS)
					return n_result;
				// if failed to map, fall back to an ordinary (blocking) copy to a temporary
				// buffer; note that &values[0] must not be evaluated on an empty vector

				r_b_comparison_result = std::equal(p_begin_it, p_end_it, values.begin(), pred);
				// compare against the copied data (the mapping failed, p_buffer_ptr
				// does not point to anything valid and must not be used here)

				return cl_Success;
			} catch(std::bad_alloc&) {
				return cl_Out_Of_Host_Memory; // not enough memory for the temporary buffer
			}
		}
		// map the buffer for reading (blocking), or use the fallback path

		try {
			const TDeviceValue *p_buffer = (const TDeviceValue*)p_buffer_ptr;
#if (defined(_WIN32) || defined(_WIN64)) && _MSC_VER > 1200
			r_b_comparison_result = std::equal(p_begin_it, p_end_it,
				stdext::make_checked_array_iterator(p_buffer, n_elem_num), pred);
			// the second iterator (the last parameter) may need to be a checked iterator, the pointer sure isn't
#else // (_WIN32 || _WIN64) && _MSC_VER > 1200
			r_b_comparison_result = std::equal(p_begin_it, p_end_it, p_buffer, pred); // no checked iterators on linux
#endif // (_WIN32 || _WIN64) && _MSC_VER > 1200
		} catch(...) {
			n_Enqueue_UnmapMemObject(p_buffer_ptr, h_buffer); // should use RAII; ignore errors here
			throw; // rethrow the original exception ("throw r_exc;" would throw a sliced copy)
		}
		// compare; the predicate may throw, make sure that the buffer gets unmapped in any case

		return n_Enqueue_UnmapMemObject(p_buffer_ptr, h_buffer);
	}

	/**
	 *	@brief enqueues buffer unmapping operation
	 *
	 *	@param[in] p_buffer is pointer to the mapped data
	 *	@param[in] h_buffer is handle of the memory object to be unmapped
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] p_finished_event is pointer to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_UnmapMemObject(void *p_buffer, cl_mem h_buffer,
		cl_uint n_wait_for_event_num = 0, const cl_event *p_wait_for_event = 0,
		cl_event *p_finished_event = 0) const;

	/**
	 *	@brief enqueues buffer unmapping operation
	 *
	 *	@param[in] p_buffer is pointer to the mapped data
	 *	@param[in] h_buffer is handle of the memory object to be unmapped
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the copy finished event
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_UnmapMemObject(void *p_buffer, cl_mem h_buffer,
		cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
		CCLUniqueEvent &r_finished_event) const;

	/**
	 *	@brief enqueues event marker
	 *	@param[out] p_event is filled with a new event object, which
	 *		triggers once the command queue gets to this marker
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Marker(cl_event *p_event) const;

	/**
	 *	@brief enqueues event marker
	 *	@param[out] r_event is filled with a new event object, which
	 *		triggers once the command queue gets to this marker
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_Marker(CCLUniqueEvent &r_event) const;

	/**
	 *	@brief enqueues wait for a list of events
	 *
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note Use CUniqueCLEvent objects instead of cl_event directly.
	 */
	inline CLresult n_Enqueue_WaitForEvents(cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event) const;

	/**
	 *	@brief enqueues a barrier, forcing all pending enqueued operations
	 *		to finish before subsequent operations can start
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *	@note This does not block the CPU.
	 */
	inline CLresult n_Enqueue_Barrier() const;

	/**
	 *	@brief waits for all the enqueued operations to finish
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *	@note This does block the CPU.
	 */
	inline CLresult n_Finish() const;

	// todo - since the handle is locked out, provide also the enqueue task and enqueue native task

private:
	/**
	 *	@brief gets the command queue handle
	 *	@return Returns the associated command queue handle.
	 *	@note This is only used by CCLKernelCall.
	 */
	inline cl_command_queue h_Handle() const;
};

/**
 *	@brief unique OpenCL command queue handle
 */
typedef CCLUniqueWrapperEx<cl_command_queue, CCLCommandQueueInterface> CCLUniqueCommandqueue;

/**
 *	@brief automatically managed OpenCL handles instance for single-device applications
 *
 *	Use as follows:
 *	@code
 *	CCLUniqueInstance opencl(CL_DEVICE_TYPE_GPU);
 *	if(!opencl.b_Status())
 *		return -1;
 *	// initialize OpenCL
 *
 *	CCLUniqueProgram program(opencl, "Kernel.c", CCLUniqueProgram::from_file);
 *	if(!program.b_Status())
 *		return -1;
 *	CCLUniqueKernel vec_add_kernel = program.h_Get_Kernel("VectorAdd");
 *	// compile a program and get a kernel
 *
 *	CCLUniqueMem dp_vector_a, dp_vector_b, dp_vector_c;
 *	if(!(dp_vector_a = opencl.h_CreateBuffer(data_size * sizeof(int), CL_MEM_READ_ONLY)) ||
 *	   !(dp_vector_b = opencl.h_CreateBuffer(data_size * sizeof(int), CL_MEM_READ_ONLY)) ||
 *	   !(dp_vector_c = opencl.h_CreateBuffer(data_size * sizeof(int), CL_MEM_WRITE_ONLY)))
 *		return -1;
 *	if(opencl[0].n_Enqueue_Memcpy_HtoD_Async(dp_vector_a, 0, data_a, data_size * sizeof(int)) != CL_SUCCESS ||
 *	   opencl[0].n_Enqueue_Memcpy_HtoD_Async(dp_vector_b, 0, data_b, data_size * sizeof(int)) != CL_SUCCESS)
 *		return -1;
 *	// alloc memory buffers and copy data
 *
 *	size_t n_local_work_size = 256;
 *	size_t n_global_work_size = n_Align_Up(n_vector_length, n_local_work_size);
 *	CCLUniqueEvent kernel_finished;
 *	if(clCall1D3(opencl[0], vec_add_kernel, n_global_work_size,
 *	   n_local_work_size, dp_vector_c, dp_vector_a, dp_vector_b).GetEvent(kernel_finished) != CL_SUCCESS)
 *		return -1;
 *	// call the kernel
 *
 *	if(opencl[0].n_Enqueue_Memcpy_DtoH(data_result, dp_vector_c, 0, data_size * sizeof(int), 1, &kernel_finished) != CL_SUCCESS)
 *		return -1;
 *	// copy the data back
 *
 *	// all handles destroyed after the end of their scope automatically
 *	@endcode
 */
class CCLUniqueInstance : public CCLUniqueContext {
	friend class CCLUniqueProgram; // uses private p_Device()

protected:
	CCLUniqueCommandqueue m_p_cmd_queue[1]; /**< @brief OpenCL command queue wrapper (one queue for the single device) */
	cl_device_id m_p_device[1]; /**< @brief OpenCL device id (a single device) */
	CLresult m_n_last_error; /**< @brief last OpenCL error */

public:
	/**
	 *	@brief default constructor; creates null instance
	 */
	inline CCLUniqueInstance();

	/**
	 *	@brief copy constructor; takes ownership of handles of the instance being assigned
	 *	@param[in,out] r_other is unique instance to copy from (it will lose ownership of the handles)
	 */
	inline CCLUniqueInstance(const CCLUniqueInstance &r_other);

	/**
	 *	@brief constructor; initializes OpenCL, chooses a single device and creates a command queue
	 *
	 *	@param[in] n_device_type is device type enum (e.g. CL_DEVICE_TYPE_GPU)
	 *	@param[in] b_implementation_profile_selection chooses between
	 *		"OpenCL implementation-specific" profile selection, or profile selection
	 *		based on implemented features
	 *	@param[in] b_stderr_output is stderr output flag (if set and an error occurs,
	 *		human-readable error message is printed to stderr)
	 *	@param[in] n_queue_options is command queue properties bitfield
	 *		(default CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
	 *
	 *	@note To check last error in initialization, use n_Status() or b_Status().
	 */
	CCLUniqueInstance(int n_device_type, bool b_implementation_profile_selection = false,
		bool b_stderr_output = true, int n_queue_options = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);

	/**
	 *	@brief constructor; initializes OpenCL, chooses a single device and creates a command queue
	 *
	 *	@tparam CDeviceSelector is device selector function object (e.g. CCLUtils::TDevice_DefaultScoring)
	 *
	 *	@param[in] device_selector is instance of the device selector
	 *	@param[in] n_device_type is device type enum (default CL_DEVICE_TYPE_GPU)
	 *	@param[in] b_implementation_profile_selection chooses between
	 *		"OpenCL implementation-specific" profile selection, or profile selection
	 *		based on implemented features
	 *	@param[in] b_stderr_output is stderr output flag (if set and an error occurs,
	 *		human-readable error message is printed to stderr)
	 *	@param[in] n_queue_options is command queue properties bitfield
	 *		(default CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
	 *
	 *	@note To check last error in initialization, use n_Status() or b_Status().
	 */
	template <class CDeviceSelector>
	CCLUniqueInstance(CDeviceSelector device_selector, int n_device_type = CL_DEVICE_TYPE_GPU,
		bool b_implementation_profile_selection = false, bool b_stderr_output = true,
		int n_queue_options = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);

	/**
	 *	@brief destructor; deletes handles to OpenCL objects
	 */
	~CCLUniqueInstance();

	/**
	 *	@brief copy operator; deletes handles to OpenCL objects, if any,
	 *		and takes ownership of handles of the instance being assigned
	 *	@param[in,out] r_other is unique instance to copy from (it will lose ownership of the handles)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueInstance &operator =(const CCLUniqueInstance &r_other);

	/**
	 *	@brief gets error state of the OpenCL initialization
	 *	@return Returns true if there was no error, otherwise returns false.
	 */
	inline bool b_Status() const;

	/**
	 *	@brief gets error state of the OpenCL initialization
	 *	@return Returns OpenCL error code (CL_SUCCESS if there was no error).
	 */
	inline CLresult n_Status() const;

	/**
	 *	@brief gets number of devices, associated with this OpenCL instance
	 *	@return Returns number of devices, associated with this OpenCL instance.
	 */
	inline size_t n_Device_Num() const;

	/**
	 *	@brief get handle to OpenCL context
	 *	@return Returns handle to OpenCL context.
	 */
	inline cl_context h_Context() const;

	/**
	 *	@brief gets id of the OpenCL device
	 *	@param[in] n_index is zero-based device index (marked unused, this class holds a single device)
	 *	@return Returns id of the OpenCL device.
	 */
	inline cl_device_id h_Device(size_t UNUSED(n_index)) const;

	/**
	 *	@brief gets handle to the OpenCL command queue
	 *	@param[in] n_index is zero-based device index (marked unused, this class holds a single queue)
	 *	@return Returns handle to the OpenCL command queue associated with the selected device.
	 */
	inline cl_command_queue h_Command_Queue(size_t UNUSED(n_index)) const;

	/**
	 *	@brief gets interface of the OpenCL command queue
	 *	@param[in] n_index is zero-based device index (marked unused, this class holds a single queue)
	 *	@return Returns interface of the OpenCL command queue associated with the selected device.
	 */
	inline const CCLCommandQueueInterface &operator [](size_t UNUSED(n_index)) const;

	/**
	 *	@brief swaps two managed OpenCL instances
	 *	@param[in,out] r_other is managed OpenCL instance to swap with
	 */
	inline void Swap(CCLUniqueInstance &r_other);

private:
	/**
	 *	@brief gets list of devices, associated with this OpenCL instance
	 *	@return Returns pointer to an array of devices, associated with this OpenCL instance.
	 *	@note This is only used by CCLUniqueProgram.
	 */
	inline const cl_device_id *p_Device() const;

protected:
	/**
	 *	@brief chooses a platform and a device, creates the OpenCL context and a command queue
	 *
	 *	@tparam CDeviceSelector is device selector function object
	 *
	 *	@param[in] device_selector is instance of the device selector, passed
	 *		to CCLUtils::n_Get_Best_DeviceId()
	 *	@param[in] n_device_type is device type enum (default CL_DEVICE_TYPE_GPU)
	 *	@param[in] n_queue_options is command queue properties bitfield,
	 *		passed to clCreateCommandQueue()
	 *	@param[in] b_implementation_profile_selection if cleared, the platform is chosen
	 *		by CCLUtils::n_Get_FullProfile_Platform(); if set, null platform handle is used
	 *	@param[in] b_stderr_output is stderr output flag (if set and an error occurs,
	 *		human-readable error message is printed to stderr)
	 *
	 *	@return Returns cl_Success on success, or the error code of the step that failed
	 *		(cl_Device_Not_Found if no device could be chosen).
	 *
	 *	@note The argument order differs from the constructors: n_queue_options
	 *		precedes the two flags here.
	 *	@note This does not write m_n_last_error, the caller needs to store the result.
	 */
	template <class CDeviceSelector>
	CLresult n_Init(CDeviceSelector device_selector, int n_device_type = CL_DEVICE_TYPE_GPU,
		int n_queue_options = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
		bool b_implementation_profile_selection = false, bool b_stderr_output = true) // unfortunately msvc 6.0 requires the function definition here, otherwise it won't compile
	{
		CLresult n_result;
		cl_platform_id h_platform;
		if(!b_implementation_profile_selection) {
			if((n_result = CCLUtils::n_Get_FullProfile_Platform(&h_platform, n_device_type)) != CL_SUCCESS) {
				if(b_stderr_output)
					fprintf(stderr, "error: failed to get OpenCL platform (%d)\n", n_result);
				return n_result;
			}
		} else
			h_platform = 0;
		// get a platform (null handle leaves the choice to the OpenCL implementation)

		if(CCLUtils::n_Get_Best_DeviceId(&m_p_device[0], h_platform, n_device_type, device_selector) < 0) {
			n_result = cl_Device_Not_Found; // might cause confusion
			if(b_stderr_output)
				fprintf(stderr, "error: failed to get handle of an OpenCL device\n");
			return n_result;
		}
		// get the best device, as rated by the device selector

		{
			cl_context h_context;
			if((n_result = CCLUtils::n_OpenCL_Init(&h_context, h_platform, 1, &m_p_device[0])) != CL_SUCCESS) {
				if(b_stderr_output)
					fprintf(stderr, "error: failed to initialize OpenCL (%d)\n", n_result);
				return n_result;
			}
			CCLUniqueContext::Destroy(); // !! release the previously held context, if any, before overwriting the handle
			CCLUniqueContext::m_h_handle = h_context;
		}
		// create OpenCL context

		cl_int n_cl_result;
		m_p_cmd_queue[0] = clCreateCommandQueue(CCLUniqueContext::h_Get(),
			m_p_device[0], n_queue_options, &n_cl_result);
		if(n_cl_result != CL_SUCCESS) {
			if(b_stderr_output) {
				fprintf(stderr, "error: failed to create OpenCL command queue (%d)\n",
					(int)n_cl_result); // report the code of this call, n_result still holds CL_SUCCESS here
			}
			return (CLresult)n_cl_result;
		}
		// create command queue

		return cl_Success;
	}
};

/**
 *	@brief automatically managed OpenCL program
 */
class CCLUniqueProgram {
protected:
	cl_program m_h_program; /**< @brief OpenCL program handle */
	CLresult m_n_last_result; /**< @brief result of the last OpenCL operation */
	int m_n_compile_flags; /**< @brief cached compiler flags */

	/**
	 *	@brief tag type, selects the build-from-source constructors
	 */
	struct TBuildFromSource_Tag {};

	/**
	 *	@brief tag type, selects the build-from-compressed-source constructors
	 */
	struct TBuildFromCompressedSource_Tag {};

	/**
	 *	@brief tag type, selects the build-from-file constructors
	 */
	struct TBuildFromFile_Tag {};

public:
	static const TBuildFromSource_Tag from_source_code; /**< @brief pass this to build from source code */
	static const TBuildFromCompressedSource_Tag from_compressed; /**< @brief pass this to build from compressed source code */
	static const TBuildFromFile_Tag from_file; /**< @brief pass this to build from a file */

public:
	/**
	 *	@brief default constructor; the program is null
	 */
	inline CCLUniqueProgram();

	/**
	 *	@brief copy constructor; the new object becomes the owner of the handles
	 *	@param[in,out] r_other is the program to take the handles from (it gives up their ownership)
	 */
	inline CCLUniqueProgram(const CCLUniqueProgram &r_other);

	/**
	 *	@brief constructor; builds a program from source code
	 *
	 *	@param[in] h_context is OpenCL context the program is built for (for all of its devices)
	 *	@param[in] p_s_source_code is null-terminated string with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-source tag (use from_source_code; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(cl_context h_context, const char *p_s_source_code,
		TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from source code
	 *
	 *	@param[in] r_instance is managed OpenCL instance; its context is the one
	 *		the program is built for (for all of its devices)
	 *	@param[in] p_s_source_code is null-terminated string with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-source tag (use from_source_code; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(CCLUniqueInstance &r_instance, const char *p_s_source_code,
		TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from source code held in a non-const string
	 *
	 *	@param[in] h_context is OpenCL context the program is built for (for all of its devices)
	 *	@param[in] p_s_source_code is null-terminated string with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-source tag (use from_source_code; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 *	@note This overload exists so that non-const source code is handled correctly,
	 *		rather than by the template version, which expects a decompressor object.
	 */
	inline CCLUniqueProgram(cl_context h_context, char *p_s_source_code,
		TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from source code held in a non-const string
	 *
	 *	@param[in] r_instance is managed OpenCL instance; its context is the one
	 *		the program is built for (for all of its devices)
	 *	@param[in] p_s_source_code is null-terminated string with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-source tag (use from_source_code; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 *	@note This overload exists so that non-const source code is handled correctly,
	 *		rather than by the template version, which expects a decompressor object.
	 */
	inline CCLUniqueProgram(CCLUniqueInstance &r_instance, char *p_s_source_code,
		TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from compressed source code
	 *
	 *	@tparam CCompressedSourceCode is class holding the compressed source code
	 *		(e.g. one generated by file_to_header, or any class with the same interface;
	 *		p_Data() and b_Dynamic() are the members used here)
	 *
	 *	@param[in] h_context is OpenCL context the program is built for (for all of its devices)
	 *	@param[in] r_source_code is the decompressor object with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-compressed-source tag (use from_compressed; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards. If p_Data()
	 *		returns null, the status is cl_Out_Of_Host_Memory and nothing is built.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	template <class CCompressedSourceCode>
	inline CCLUniqueProgram(cl_context h_context, const CCompressedSourceCode &r_source_code,
		TBuildFromCompressedSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%")
		:m_h_program(0), m_n_last_result(cl_Success), m_n_compile_flags(0)
	{
		const char *p_s_plain_source = r_source_code.p_Data();
		// decompress

		if(!p_s_plain_source) {
			m_n_last_result = cl_Out_Of_Host_Memory; // well, ...
			return;
		}
		// no source code to build from

		CCLUniqueProgram built_program(h_context, p_s_plain_source,
			from_source_code, p_s_compiler_options, p_s_cache_file);
		Swap(built_program);
		// build using the plain source constructor and take over its result

		if(r_source_code.b_Dynamic())
			delete[] const_cast<char*>(p_s_plain_source);
		// free memory, if the decompressor allocated it
	}

	/**
	 *	@brief constructor; builds a program from compressed source code
	 *
	 *	@tparam CCompressedSourceCode is class holding the compressed source code
	 *		(e.g. one generated by file_to_header, or any class with the same interface;
	 *		p_Data() and b_Dynamic() are the members used here)
	 *
	 *	@param[in] r_instance is managed OpenCL instance; its context is the one
	 *		the program is built for (for all of its devices)
	 *	@param[in] r_source_code is the decompressor object with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-compressed-source tag (use from_compressed; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards. If p_Data()
	 *		returns null, the status is cl_Out_Of_Host_Memory and nothing is built.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	template <class CCompressedSourceCode>
	inline CCLUniqueProgram(CCLUniqueInstance &r_instance, const CCompressedSourceCode &r_source_code,
		TBuildFromCompressedSource_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%")
		:m_h_program(0), m_n_last_result(cl_Success), m_n_compile_flags(0)
	{
		const char *p_s_plain_source = r_source_code.p_Data();
		// decompress

		if(!p_s_plain_source) {
			m_n_last_result = cl_Out_Of_Host_Memory; // well, ...
			return;
		}
		// no source code to build from

		CCLUniqueProgram built_program(r_instance, p_s_plain_source,
			from_source_code, p_s_compiler_options, p_s_cache_file);
		Swap(built_program);
		// build using the plain source constructor and take over its result

		if(r_source_code.b_Dynamic())
			delete[] const_cast<char*>(p_s_plain_source);
		// free memory, if the decompressor allocated it
	}

	/**
	 *	@brief constructor; builds a program from a file
	 *
	 *	@param[in] r_instance is managed OpenCL instance; its context is the one
	 *		the program is built for (for all of its devices)
	 *	@param[in] p_s_filename is path to the file with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-file tag (use from_file; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(CCLUniqueInstance &r_instance, const char *p_s_filename,
		TBuildFromFile_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from a file
	 *
	 *	@param[in] h_context is OpenCL context the program is built for (for all of its devices)
	 *	@param[in] p_s_filename is path to the file with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-file tag (use from_file; the value is unused)
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
		TBuildFromFile_Tag UNUSED(t_tag), const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from a file, for a single explicitly given device
	 *
	 *	@param[in] h_context is OpenCL context the program is built in
	 *	@param[in] p_s_filename is path to the file with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-file tag (use from_file; the value is unused)
	 *	@param[in] h_device is OpenCL id of the device the program is built for
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
		TBuildFromFile_Tag UNUSED(t_tag), cl_device_id h_device,
		const char *p_s_compiler_options = "", const char *p_s_cache_file = "%copykernelname%");

	/**
	 *	@brief constructor; builds a program from a file, for an explicitly given set of devices
	 *
	 *	@param[in] h_context is OpenCL context the program is built in
	 *	@param[in] p_s_filename is path to the file with OpenCL "C" source code
	 *	@param[in] t_tag is the build-from-file tag (use from_file; the value is unused)
	 *	@param[in] n_device_num is number of the devices the program is built for
	 *	@param[in] p_device is pointer to an array of ids of the devices the program is built for
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in (the default
	 *		"%copykernelname%" names the cache file automatically and puts it in the temp folder)
	 *	@param[in] n_max_cache_size is maximum size of the program binary cache (default 32)
	 *
	 *	@note The build may fail; check b_Status() or n_Status() afterwards.
	 *	@note The cache file name may contain wildcards, see
	 *		CCLProgramCompiler::n_CompileProgram() for details.
	 */
	CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
		TBuildFromFile_Tag UNUSED(t_tag), size_t n_device_num,
		const cl_device_id *p_device, const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%", int n_max_cache_size = 32);

	/**
	 *	@brief destructor; deletes the handles to OpenCL objects
	 */
	~CCLUniqueProgram();

	/**
	 *	@brief copy operator; deletes the currently held handles to OpenCL objects (if any)
	 *		and becomes the owner of the handles of the assigned program
	 *	@param[in,out] r_other is the program to take the handles from (it gives up their ownership)
	 *	@return Returns reference to this.
	 */
	inline CCLUniqueProgram &operator =(const CCLUniqueProgram &r_other);

	/**
	 *	@brief gets the state of the program compilation as a boolean
	 *	@return Returns true if no error occurred, otherwise returns false.
	 */
	inline bool b_Status() const;

	/**
	 *	@brief gets the state of the program compilation as an error code
	 *	@return Returns OpenCL error code (CL_SUCCESS if no error occurred).
	 */
	inline CLresult n_Status() const;

	/**
	 *	@brief gets the bitfield describing the build
	 *	@return Returns bitfield with information about the build.
	 *	@note See also CCLProgramCompiler::n_CompileProgram() or CCLProgramCompiler::Dump_StatusWord().
	 */
	inline int n_StatusWord() const;

	/**
	 *	@brief prints human-readable information about the build to stdout
	 *	@note See also CCLProgramCompiler::Dump_StatusWord().
	 */
	inline void Dump_StatusWord() const;

	/**
	 *	@brief gets handle to a kernel of this program, along with the error code
	 *
	 *	@param[in] p_s_kernel_name is null-terminated string with the kernel name (case sensitive)
	 *	@param[out] r_n_result is OpenCL error code (CL_SUCCESS on success)
	 *
	 *	@return Returns handle to the specified kernel on success, undefined value on failure.
	 *
	 *	@note The returned handles need to be released using clReleaseKernel().
	 *		Preferably use CCLUniqueKernel, which does that automatically.
	 */
	cl_kernel h_Get_Kernel(const char *p_s_kernel_name, CLresult &r_n_result);

	/**
	 *	@brief gets handle to a kernel of this program
	 *	@param[in] p_s_kernel_name is null-terminated string with the kernel name (case sensitive)
	 *	@return Returns handle to the specified kernel on success, null handle on failure.
	 *	@note The returned handles need to be released using clReleaseKernel().
	 *		Preferably use CCLUniqueKernel, which does that automatically.
	 */
	cl_kernel h_Get_Kernel(const char *p_s_kernel_name);

	/**
	 *	@brief swaps two managed OpenCL programs
	 *	@param[in,out] r_other is the managed OpenCL program to swap with
	 */
	inline void Swap(CCLUniqueProgram &r_other);

protected:
	/**
	 *	@brief program build helper (presumably shared by the non-template
	 *		constructors; the definition is not in this part of the file)
	 *
	 *	@param[in] b_is_filename presumably chooses whether p_s_filename is a path
	 *		to a file or the source code itself (to be confirmed in the definition)
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] p_s_filename is null-terminated string (see b_is_filename)
	 *	@param[in] n_device_num is number of devices
	 *	@param[in] p_device is pointer to an array of OpenCL device ids
	 *	@param[in] p_s_compiler_options is null-terminated string with compiler options (empty by default)
	 *	@param[in] p_s_cache_file is name of the file the built binary is cached in
	 *	@param[in] n_max_cache_size is maximum size of the program binary cache (default 32)
	 *
	 *	@return Returns OpenCL error code.
	 */
	CLresult n_Init(bool b_is_filename, cl_context h_context, const char *p_s_filename,
		size_t n_device_num, const cl_device_id *p_device, const char *p_s_compiler_options = "",
		const char *p_s_cache_file = "%copykernelname%", int n_max_cache_size = 32);
};

/**
 *	@brief local memory allocation size, used to specify local memory size for kernel arguments
 */
class CCLLocalMem {
protected:
	size_t m_n_size; /**< @brief local memory allocation size */

public:
	/**
	 *	@brief constructor
	 *	@param[in] n_size is local memory allocation size (in bytes, as in the
	 *		usage example in the file header)
	 *	@note This constructor is not explicit, so a size_t value converts
	 *		to CCLLocalMem implicitly.
	 */
	inline CCLLocalMem(size_t n_size);

	/**
	 *	@brief gets local memory allocation size
	 *	@return Returns local memory allocation size.
	 */
	inline size_t n_Size() const;
};

/**
 *	@brief OpenCL driver api argument loader for kernel functions
 *
 *	Each overloaded comma operator loads one kernel argument and returns
 *	a loader for the next argument index, which lets the arguments be written
 *	as a comma separated list.
 *
 *	@tparam n_index is zero-based index of the kernel argument to be loaded next
 *
 *	@note This class shouldn't be used directly. Use clSetKernelArgs() macro instead.
 */
template <const int n_index>
class CCLArgLoader {
protected:
	cl_kernel m_h_func; /**< @brief kernel the arguments are being set for */
	CLresult m_n_result; /**< @brief result of the OpenCL calls made so far */

public:
	/**
	 *	@brief constructor
	 *
	 *	@param[in] h_func is OpenCL kernel handle the parameters are being set for
	 *	@param[in] n_result is result of the previous OpenCL calls
	 */
	inline CCLArgLoader(cl_kernel h_func, CLresult n_result = cl_Success);

	/**
	 *	@brief gets OpenCL result code
	 *	@return Returns result of the clSetKernelArg() calls.
	 */
	inline CLresult n_Result() const;

	/**
	 *	@brief gets count of all loaded parameters
	 *	@return Returns count of all the parameters.
	 */
	inline int n_Count() const;

	/**
	 *	@brief loads a single integer parameter n_value
	 *	@param[in] n_value is integer value (of the loaded parameter)
	 *	@return Returns loader with index of the next parameter.
	 */
	inline CCLArgLoader<n_index + 1> operator ,(int n_value);

	/**
	 *	@brief loads a single float parameter f_value
	 *	@param[in] f_value is float value (of the loaded parameter)
	 *	@return Returns loader with index of the next parameter.
	 */
	inline CCLArgLoader<n_index + 1> operator ,(float f_value);

	/**
	 *	@brief loads a single memory object parameter p_value
	 *	@param[in] p_value is OpenCL memory object handle (of the loaded parameter)
	 *	@return Returns loader with index of the next parameter.
	 *	@note in case device pointers are used, they need to be cast
	 *		to size_t and then to void* (or use the DevPtr() macro).
	 *		(NOTE(review): this note may be a leftover from a CUDA version
	 *		of this class, no DevPtr() is visible in this part of the file.)
	 */
	inline CCLArgLoader<n_index + 1> operator ,(cl_mem p_value);

	/**
	 *	@brief loads a single local memory parameter t_local_mem_cfg
	 *	@param[in] t_local_mem_cfg is local memory allocation size
	 *	@return Returns loader with index of the next parameter.
	 *	@note Only the size is carried by CCLLocalMem, there is no pointer value to pass.
	 */
	inline CCLArgLoader<n_index + 1> operator ,(CCLLocalMem t_local_mem_cfg);
};

/**
 *	@brief calls a kernel
 *
 *	The constructors take the command queue, the kernel and the work sizes
 *	for 1D, 2D or 3D tasks. The kernel is documented to be called by the
 *	conversion to CLresult or by After(), GetEvent() or WithEvents()
 *	(NOTE(review): whether the constructors themselves enqueue anything
 *	can't be told from the declarations, confirm in the implementation).
 */
class CCLKernelCall {
protected:
	CLresult m_n_result; /**< @brief call result */
	cl_command_queue m_h_cmd_queue; /**< @brief command queue to submit to */
	cl_kernel m_h_kernel; /**< @brief kernel to launch */
	int m_n_dimension; /**< @brief task dimension */
	size_t m_p_global_work_size[3]; /**< @brief global work size */
	size_t m_p_local_work_size[3]; /**< @brief local work size */

public:
	/**
	 *	@brief constructor; sets up a call of a 1D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] h_cmd_queue is handle to command queue
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size
	 *	@param[in] n_block_size_x is thread block size
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		cl_command_queue h_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_block_size_x);
	/**
	 *	@brief constructor; sets up a call of a 1D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] r_cmd_queue is command queue interface
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size
	 *	@param[in] n_block_size_x is thread block size
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_block_size_x);

	/**
	 *	@brief constructor; sets up a call of a 2D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] h_cmd_queue is handle to command queue
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size in the x dimension
	 *	@param[in] n_work_size_y is global work size in the y dimension
	 *	@param[in] n_block_size_x is thread block size in the x dimension
	 *	@param[in] n_block_size_y is thread block size in the y dimension
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		cl_command_queue h_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_work_size_y,
		size_t n_block_size_x, size_t n_block_size_y);

	/**
	 *	@brief constructor; sets up a call of a 2D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] r_cmd_queue is command queue interface
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size in the x dimension
	 *	@param[in] n_work_size_y is global work size in the y dimension
	 *	@param[in] n_block_size_x is thread block size in the x dimension
	 *	@param[in] n_block_size_y is thread block size in the y dimension
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_work_size_y,
		size_t n_block_size_x, size_t n_block_size_y);

	/**
	 *	@brief constructor; sets up a call of a 3D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] h_cmd_queue is handle to command queue
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size in the x dimension
	 *	@param[in] n_work_size_y is global work size in the y dimension
	 *	@param[in] n_work_size_z is global work size in the z dimension
	 *	@param[in] n_block_size_x is thread block size in the x dimension
	 *	@param[in] n_block_size_y is thread block size in the y dimension
	 *	@param[in] n_block_size_z is thread block size in the z dimension
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		cl_command_queue h_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_work_size_y, size_t n_work_size_z,
		size_t n_block_size_x, size_t n_block_size_y, size_t n_block_size_z);

	/**
	 *	@brief constructor; sets up a call of a 3D kernel
	 *
	 *	@param[in] n_loader_result is result of argument loader
	 *	@param[in] r_cmd_queue is command queue interface
	 *	@param[in] h_kernel is handle to kernel
	 *	@param[in] n_work_size_x is global work size in the x dimension
	 *	@param[in] n_work_size_y is global work size in the y dimension
	 *	@param[in] n_work_size_z is global work size in the z dimension
	 *	@param[in] n_block_size_x is thread block size in the x dimension
	 *	@param[in] n_block_size_y is thread block size in the y dimension
	 *	@param[in] n_block_size_z is thread block size in the z dimension
	 */
	inline CCLKernelCall(CLresult n_loader_result,
		const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
		size_t n_work_size_x, size_t n_work_size_y, size_t n_work_size_z,
		size_t n_block_size_x, size_t n_block_size_y, size_t n_block_size_z);

	/**
	 *	@brief calls the kernel
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 */
	inline operator CLresult();

	/**
	 *	@brief calls the kernel after the specified events finish
	 *
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 *
	 *	@note Use CCLUniqueEvent objects instead of cl_event directly.
	 */
	inline CLresult After(cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event);

	/**
	 *	@brief calls the kernel and creates an event object associated with the kernel finishing
	 *	@param[out] p_finished_event is pointer to store the kernel finished event
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 *	@note Use CCLUniqueEvent objects instead of cl_event directly.
	 */
	inline CLresult GetEvent(cl_event *p_finished_event);

	/**
	 *	@brief calls the kernel and creates an event object associated with the kernel finishing
	 *	@param[out] r_finished_event is reference to store the kernel finished event
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 */
	inline CLresult GetEvent(CCLUniqueEvent &r_finished_event);

	/**
	 *	@brief calls the kernel after the specified events finish and
	 *		creates an event object associated with the kernel finishing
	 *
	 *	@param[in] n_wait_for_event_num is number of events to wait for (default 0)
	 *	@param[in] p_wait_for_event is pointer to the events to wait for (default null)
	 *	@param[out] p_finished_event is pointer to store the kernel finished event (default null)
	 *
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 *
	 *	@note Use CCLUniqueEvent objects instead of cl_event directly.
	 */
	inline CLresult WithEvents(cl_uint n_wait_for_event_num = 0,
		const cl_event *p_wait_for_event = 0, cl_event *p_finished_event = 0);

	/**
	 *	@brief calls the kernel after the specified events finish and
	 *		creates an event object associated with the kernel finishing
	 *
	 *	@param[in] n_wait_for_event_num is number of events to wait for
	 *	@param[in] p_wait_for_event is pointer to the events to wait for
	 *	@param[out] r_finished_event is reference to store the kernel finished event
	 *
	 *	@return Returns result of the clEnqueueNDRangeKernel() call.
	 */
	inline CLresult WithEvents(cl_uint n_wait_for_event_num,
		const cl_event *p_wait_for_event, CCLUniqueEvent &r_finished_event);
};

#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@def clSetKernelArgs
 *	@brief calls all OpenCL functions required to pass parameters to a kernel
 *
 *	Expands to a single expression: a CCLArgLoader<0> temporary is constructed
 *	from h_func, the arguments are chained onto it using commas (presumably
 *	an overloaded comma operator of the loader - verify in its declaration)
 *	and the value of the whole expression is n_Result() of the resulting object.
 *
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of the rest of parameters, those may be int, float or cl_mem
 *		(at least one is required; an empty list leaves a dangling comma
 *		in the expansion, which does not compile)
 *
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers clSetKernelArgs0() through clSetKernelArgs16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 */
#define clSetKernelArgs(h_func, ...) (CCLArgLoader<0>(h_func), __VA_ARGS__).n_Result()

/**
 *	@def clCall1D
 *	@brief sets arguments of a 1D kernel and calls it
 *
 *	Expands to a CCLKernelCall temporary cast to CLresult. The result of
 *	clSetKernelArgs() is passed as the first constructor argument, so the
 *	kernel arguments are set before the call object is constructed.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be an expression without side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... is list of the rest of parameters, those may be int, float or cl_mem
 *		(at least one is required, see clSetKernelArgs)
 *
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers clCall1D0() through clCall1D16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 *	@note The expansion is not enclosed in outer parentheses. A member call written
 *		right after the macro invocation therefore binds to the CCLKernelCall
 *		temporary before the cast to CLresult is applied (presumably this is how
 *		GetEvent() / WithEvents() are meant to be chained - confirm before
 *		adding the parentheses).
 */
#define clCall1D(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall2D
 *	@brief sets arguments of a 2D kernel and calls it
 *
 *	Expands to a CCLKernelCall temporary cast to CLresult. The result of
 *	clSetKernelArgs() is passed as the first constructor argument, so the
 *	kernel arguments are set before the call object is constructed.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be an expression without side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... is list of the rest of parameters, those may be int, float or cl_mem
 *		(at least one is required, see clSetKernelArgs)
 *
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers clCall2D0() through clCall2D16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 *	@note The expansion is not enclosed in outer parentheses. A member call written
 *		right after the macro invocation therefore binds to the CCLKernelCall
 *		temporary before the cast to CLresult is applied (presumably this is how
 *		GetEvent() / WithEvents() are meant to be chained - confirm before
 *		adding the parentheses).
 */
#define clCall2D(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs((h_kernel), __VA_ARGS__), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall3D
 *	@brief sets arguments of a 3D kernel and calls it
 *
 *	Expands to a CCLKernelCall temporary cast to CLresult. The result of
 *	clSetKernelArgs() is passed as the first constructor argument, so the
 *	kernel arguments are set before the call object is constructed.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be an expression without side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... is list of the rest of parameters, those may be int, float or cl_mem
 *		(at least one is required, see clSetKernelArgs)
 *
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers clCall3D0() through clCall3D16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 *	@note The expansion is not enclosed in outer parentheses. A member call written
 *		right after the macro invocation therefore binds to the CCLKernelCall
 *		temporary before the cast to CLresult is applied (presumably this is how
 *		GetEvent() / WithEvents() are meant to be chained - confirm before
 *		adding the parentheses).
 */
#define clCall3D(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs((h_kernel), __VA_ARGS__), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), (n_block_size_x), \
	(n_block_size_y), (n_block_size_z))

#endif // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@def DevPtrParam
 *	@brief helper for clSetKernelArg(), passes a device pointer as a kernel argument
 *
 *	Expands to two comma-separated function arguments: the size of cl_mem and
 *	the address of dp_ptr cast to void*. These take the place of the last two
 *	arguments (argument size and argument value) of clSetKernelArg().
 *
 *	@param[in] dp_ptr is device pointer (cl_mem type); its address is taken,
 *		so it needs to be an lvalue (a variable, not a temporary)
 *
 *	Use in the following way:
 *	@code
 *	cl_kernel h_kernel; // some function
 *	int n_index = 0; // some parameter index
 *	cl_int n_err_num; // memory allocation result
 *	cl_mem dp_pointer = clCreateBuffer(h_gpu_context, CL_MEM_READ_WRITE, 1024, NULL, &n_err_num); // device-side pointer
 *
 *	clSetKernelArg(h_kernel, n_index, DevPtrParam(dp_pointer));
 *	@endcode
 *
 *	@deprecated This was provided only for compatibility
 *		with the CUDA header, and is now deprecated.
 */
#define DevPtrParam(dp_ptr) sizeof(cl_mem), (void*)&(dp_ptr)

// inlines below

#include "ClUtils.inl"

#endif // !__CUDA_UTILS_INCLUDED
