#include "fw_incl.h"
#include "manager.h"
//#include <wal.h>
//#include <wal_bce_jk.h>
/* include WAL - worker abstraction layer */
#include "../api/12-mb-petalinux/libwal/wal.h"
#include "../api/12-mb-petalinux/libwal/wal_bce_dma.h"
/* include description of hardware IP core BCE worker */
#include "../api/12-mb-petalinux/libbce_config_step5/bce_dma_config.h"

//#include "bce_fp01_1x1_plbw.h"
//#define _GNU_SOURCE
#include <pthread.h>
#include <time.h>
#include <omp.h>

//#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <unistd.h>
#include <stdio.h>



/* Process-wide lock taken by fw_data_init, fw_data_get, fw_init_fw and
 * fw_op_start around their worker accesses and worker_table updates. */
pthread_mutex_t global_lock=PTHREAD_MUTEX_INITIALIZER;
#define MMULT_TRIG 0x82 
// operation code sent to the worker (through wal_mb2pb in fw_op_start) to trigger the operation

/* One slot of worker_table: a worker handle plus its bookkeeping. */
typedef struct table{
	/* WAL handle of the worker stored in this slot */
	wal_worker_t *wrk;
	/* 1 = free, 0 = reserved (reserved by fw_init_fw, released by fw_data_get) */
	unsigned int state;
	/* index into fw[] of the firmware loaded into WAL_PBID_P0, -1 if none */
	int fw0;
	/* firmware id that selects WAL_PBID_P1 when matched, -1 if none */
	int fw1;
	/* slot chosen by the current reservation: 0 -> WAL_PBID_P0, 1 -> WAL_PBID_P1, -1 if unset */
	int useFw;

}table_entry;

/* Number of slots in worker_table. */
#define WORKER_COUNT 2
/* Becomes non-zero once fw_init_table has started populating worker_table. */
unsigned int table_trigger=0;
table_entry worker_table[WORKER_COUNT]; //table for accessing worker data structures
const unsigned int *fw[20]; //table of firmwares, filled by fw_reg() in fw_init_table

/* Declare the two workers that fw_init_table stores in worker_table[0] and
 * worker_table[1].
 * NOTE(review): the meaning of the remaining macro arguments is defined by
 * the WAL headers and is not visible here. */
WAL_REGISTER_WORKER(worker, BCE_DMA_GENERIC_4D, bce_dma_cfgtable, 0, 1, 0);
WAL_REGISTER_WORKER(worker2, BCE_DMA_GENERIC_4D, bce_dma_cfgtable, 1, 1, 0);
////////////////////////

/* Uncomment to compile out all locking (see the #ifdef DISABLE_SYNC block below). */
//#define DISABLE_SYNC
/* When defined, fw_data_get calls flush_cache() on the destination buffer
 * after the DMA transfer has completed. */
#define FLUSH_DCACHE

/* Number of firmware images that fw_init_table loads, one per worker. */
#define FW_COUNT 2

/* Logging is compiled out: both macros expand to an empty statement. */
//#define LOG(...) printf(__VA_ARGS__);
#define LOG(...) do {} while (0)
#define LOGFW(...) do {} while (0)

/* Called once per iteration of every polling loop in this file.
 * NOTE(review): nanosleep() is given a NULL request; POSIX expects a valid
 * timespec, so the call most likely fails immediately (EFAULT on Linux) and
 * the loops effectively busy-wait - confirm and consider a real interval. */
#define PASSIVE_WAIT() nanosleep(NULL, NULL)

/* With DISABLE_SYNC defined, lock and unlock become empty statements. */
#ifdef DISABLE_SYNC
#define pthread_mutex_lock(a) do {} while (0)
#define pthread_mutex_unlock(a) do {} while (0)
#endif

// gcc for microblaze does not allow local aligned arrays
// DEF_ALIGNED_ARRAY(name, size) declares a float buffer unaligned_<name> with
// size+1 elements and a pointer <name> that is set to element 0 when the
// buffer address is a multiple of 8, and to element 1 otherwise.
// (Assumes 4-byte floats so that element 1 is then 8-byte aligned - TODO confirm.)
// The expansion is two declarations followed by an if/else, so it must be used
// where declarations are allowed and be terminated with ';'.
#define DEF_ALIGNED_ARRAY(name, size) \
	float unaligned_##name[(size)+1]; \
	float *name; \
	if (((int)unaligned_##name & 0x7) != 0) \
		name = unaligned_##name + 1; \
	else \
		name = unaligned_##name

/*
 * Execute the "wdc" instruction over a buffer.
 *
 * For pos = 0, 4, 8, ... while pos < size, one "wdc" is issued for the
 * address addr + 4*pos (byte arithmetic on the void pointer, a GNU C
 * extension).
 *
 * NOTE(review): this stepping matches `size` being counted in 32-bit words
 * with one instruction per 16 bytes - confirm against the cache line size of
 * the target configuration.
 */
static inline
void flush_cache(
	void* addr,
	size_t size)
{
	const size_t step = 4;
	size_t pos = 0;

	while (pos < size) {
		__asm volatile (
			"wdc %0, r0;"
			:
			: "r" (addr + pos * step)
			: "memory"
		);
		pos += step;
	}
}


//#define MEASURE_TIME
/*clock_t g_clock_data_out;
clock_t g_clock_data_in;
clock_t g_clock_calc;*/

/* Scratch timestamps written by CLOCK_START()/CLOCK_END(); only touched when
 * MEASURE_TIME is defined. They are shared by every caller of those macros. */
struct timespec begints, endts;
//unsigned long long begin = 0, end = 0;
/* Accumulated nanoseconds spent in the DMA transfer of fw_data_init. */
unsigned long long g_clock_data_out;
/* Accumulated nanoseconds spent in the DMA transfer (and cache flush) of fw_data_get. */
unsigned long long g_clock_data_in;
/* Accumulated nanoseconds spent starting and waiting for the operation in fw_op_start. */
unsigned long long g_clock_calc;

#define NSEC_IN_SEC 1000000000LL

/*
 * Compute *ts1 - *ts2.
 *
 * When the nanosecond difference is negative, one second is borrowed so that
 * the returned tv_nsec is brought back up by NSEC_IN_SEC.
 *
 * The result is kept in a single static object: the returned pointer is
 * overwritten by the next call and must not be freed.
 */
struct timespec *TimeSpecDiff(struct timespec *ts1, struct timespec *ts2)
{
  static struct timespec diff;
  time_t seconds = ts1->tv_sec - ts2->tv_sec;
  long nanos = ts1->tv_nsec - ts2->tv_nsec;

  if (nanos < 0) {
    /* borrow one second */
    seconds -= 1;
    nanos += NSEC_IN_SEC;
  }

  diff.tv_sec = seconds;
  diff.tv_nsec = nanos;
  return &diff;
}


/* Profiling hooks. With MEASURE_TIME defined, CLOCK_START() stores the current
 * time in begints and CLOCK_END(counter) adds the nanoseconds elapsed since
 * then to `counter`. Without it both macros expand to an empty statement. */
#ifdef MEASURE_TIME
//#define CLOCK_START() clock_t _clock_start = clock();
//#define CLOCK_END(counter) clock_t _clock_finish = clock(); counter += _clock_finish - _clock_start;
/* NOTE(review): these clock ids are defined locally; they may clash with the
 * definitions provided by <time.h> - confirm for the target toolchain. */
#   define CLOCK_REALTIME	0
/* High-resolution timer from the CPU.  */
#   define CLOCK_PROCESS_CPUTIME_ID	2
/* Thread-specific CPU-time clock.  */
#   define CLOCK_THREAD_CPUTIME_ID	3

/* Note: this expansion already ends in ';', unlike the disabled variant below. */
#define CLOCK_START() clock_gettime(CLOCK_REALTIME, &begints);
/* Uses the globals begints/endts and the static result of TimeSpecDiff(). */
#define CLOCK_END(counter) \
		do {\
			clock_gettime(CLOCK_REALTIME, &endts); \
			struct timespec* tmpts = TimeSpecDiff(&endts, &begints); \
			counter += tmpts->tv_sec * NSEC_IN_SEC + tmpts->tv_nsec; \
		} while (0)

#else
#define CLOCK_START() do {} while (0)
#define CLOCK_END(counter) do {} while (0)
#endif

/*
 * Send input data to a worker's data memory over DMA channel 0 and wait until
 * the transfer has finished. The whole sequence runs under global_lock.
 *
 * wrk_index - index of the worker in worker_table
 * xh        - destination address inside the memory bank
 * mem       - memory bank selector: 0..3 select WAL_BCE_JSY_DMEM_A..D,
 *             any other value leaves the bank id at 0
 * x_inc     - increment rate; 0 means "send the scalar `constant`"
 *             (x_data is then ignored)
 * x_data    - pointer to the source data
 * add_mod   - element offset into x_data at which the transfer starts
 * mult_mod  - the transfer length is xNN*mult_mod words
 * xNN       - trip count
 * constant  - value that is sent when x_inc == 0
 *
 * Errors are reported on stderr; the function returns nothing. The lock is
 * released on every path.
 */
void fw_data_init(unsigned int wrk_index,unsigned int xh, unsigned int mem,unsigned int x_inc,float *x_data,unsigned int add_mod,unsigned int mult_mod, unsigned int xNN,  float constant){
	unsigned int bank=0;
	/* Aligned staging buffer for an unaligned constant. It lives at function
	 * scope so that it is still valid while the DMA transfer below runs. */
	DEF_ALIGNED_ARRAY(tmp, 8);

	switch(mem){
	case 0: bank=WAL_BCE_JSY_DMEM_A; break;
	case 1: bank=WAL_BCE_JSY_DMEM_B; break;
	case 2: bank=WAL_BCE_JSY_DMEM_C; break;
	case 3: bank=WAL_BCE_JSY_DMEM_D; break;
	default: break;
	}

	if(x_inc==0){
		/* a zero increment means the source is the scalar constant */
		x_data=&constant;
	}

	pthread_mutex_lock(&global_lock);

	if (((int)&x_data[add_mod] & 0x7) == 0x4)
	{
		/* source address is only 4-byte aligned */
		if (x_inc == 0)
		{
			/* copy the constant into the aligned buffer and send that
			 * buffer (not the address of the pointer variable) */
			tmp[0] = constant;

			if (wal_dma_configure(worker_table[wrk_index].wrk, 0, tmp, 0, bank, xh, /*xNN*mult_mod*/2))
			{
				fprintf(stderr, "Error: fw_data_init - wal_dma_configure failed!\n");
				goto unlock;
			}
		}
		else
		{
			/* NOTE(review): no transfer is configured on this path, yet the
			 * DMA is still started below, as before - confirm intent. */
			printf("Warning: Trying to copy unaligned data from address 0x%x.\n", (int)&x_data[add_mod]);
		}
	}
	else
	{
		if (wal_dma_configure(worker_table[wrk_index].wrk, 0, &(x_data[add_mod]), 0, bank, xh, xNN*mult_mod))
		{
			fprintf(stderr, "Error: fw_data_init - wal_dma_configure failed!\n");
			goto unlock;
		}
	}

	CLOCK_START();

	if (wal_dma_start(worker_table[wrk_index].wrk, 0, WAL_DMA_REQ_RD))
	{
		fprintf(stderr, "Error: fw_data_init - wal_dma_start failed!\n");
		goto unlock;
	}

	// FIXME: polling wait
	while (wal_dma_isbusy(worker_table[wrk_index].wrk, (0x01))!=0){
		PASSIVE_WAIT();
	}

	CLOCK_END(g_clock_data_out);

unlock:
	pthread_mutex_unlock(&global_lock);

}

/*
 * Fetch results from a worker's data memory over DMA channel 0, wait for the
 * transfer to finish and mark the worker as free again. The whole sequence
 * runs under global_lock.
 *
 * wrk_index - index of the worker in worker_table
 * xh        - source address inside the memory bank
 * mem       - memory bank selector: 0..3 select WAL_BCE_JSY_DMEM_A..D,
 *             any other value is passed on unchanged
 * x_inc     - increment rate (only used by the LOG message)
 * x_data    - pointer to the result store
 * add_mod   - element offset into x_data at which the results are written
 * mult_mod  - the transfer length is xNN*mult_mod words
 * xNN       - data count
 *
 * Errors are reported on stderr; the function returns nothing. The lock is
 * released and the worker is marked free on every path, so a failed transfer
 * can neither deadlock other threads nor leave the worker reserved forever.
 */
void fw_data_get(unsigned int wrk_index,unsigned int xh ,unsigned int mem,unsigned int x_inc, float *x_data, unsigned int add_mod, unsigned int mult_mod,unsigned int xNN ){
	switch(mem){
	case 0: mem=WAL_BCE_JSY_DMEM_A; break;
	case 1: mem=WAL_BCE_JSY_DMEM_B; break;
	case 2: mem=WAL_BCE_JSY_DMEM_C; break;
	case 3: mem=WAL_BCE_JSY_DMEM_D; break;
	default: break;
	}

	// FIXME: is it necessary to wait for all transfers?
	pthread_mutex_lock(&global_lock);

	LOG("Retrieving data from 0x%x (ddaddr) to bank %d (bramidx) address 0x%x (bramaddr) of size %d words with increment %d.\n",
			  (int)&x_data[add_mod], mem, xh, xNN*mult_mod, x_inc);

	if (wal_dma_configure(worker_table[wrk_index].wrk, 0, &(x_data[add_mod]), 0, mem, xh, xNN*mult_mod))
	{
		fprintf(stderr, "Error: fw_data_get - wal_dma_configure failed!\n");
		goto release;
	}

	CLOCK_START();

	if (wal_dma_start(worker_table[wrk_index].wrk, 0, WAL_DMA_REQ_WR))
	{
		fprintf(stderr, "Error: fw_data_get - wal_dma_start failed!\n");
		goto release;
	}

	while (wal_dma_isbusy(worker_table[wrk_index].wrk, (0x01))!=0){
		PASSIVE_WAIT();
	}

#ifdef FLUSH_DCACHE
	/* the DMA wrote behind the CPU's back; drop stale cached copies */
	flush_cache(&x_data[add_mod], xNN*mult_mod + 16);
#endif

	CLOCK_END(g_clock_data_in);

release:
	LOGFW("Releasing worker %d, 0x%x\n", wrk_index, worker_table[wrk_index].wrk);
	worker_table[wrk_index].state=1;
	pthread_mutex_unlock(&global_lock);

}


/*
 * Initialize the worker table and bring up the registered workers.
 *
 * fw_reg(fw) is called on every invocation; everything else runs only on the
 * first call (guarded by table_trigger):
 *   1. fill worker_table: slots 0 and 1 get `worker` and `worker2` and start
 *      out free, any further slot starts out reserved; no firmware recorded;
 *   2. wal_init_worker() on each registered worker;
 *   3. load firmware fw[i] into WAL_PBID_P0 of worker i and record it in fw0;
 *   4. wal_reset_worker() on each registered worker.
 *
 * On the first failure an error is printed to stderr and the function returns
 * early. table_trigger stays set, so later calls do not retry.
 *
 * NOTE(review): table_trigger is tested and set without any lock - the first
 * call is presumably made before worker threads start; confirm.
 */
void fw_init_table(){
	/* `worker` and `worker2` are the only workers registered in this file */
	const int registered=(WORKER_COUNT<2) ? WORKER_COUNT : 2;
	int i;

	fw_reg(fw); //register firmwares into fw list
	if(table_trigger!=0){
		return;
	}
	++table_trigger;

	for(i=0;i<WORKER_COUNT;++i){
		if(i==0){
			worker_table[i].wrk=worker;
		}else if(i==1){
			worker_table[i].wrk=worker2;
		}
		/* only slots backed by a registered worker start out free */
		worker_table[i].state=(i<registered) ? 1 : 0;
		worker_table[i].fw0=-1;
		worker_table[i].fw1=-1;
		worker_table[i].useFw=-1;
	}

	for(i=0;i<registered;++i){
		if (wal_init_worker(worker_table[i].wrk))
		{
			fprintf(stderr, "Error: could not init worker 0x%x\n", (unsigned int)worker_table[i].wrk);
			return ;
		}
	}

	/* firmware i goes to worker i, so never run past the registered workers */
	for(i=0;i<FW_COUNT && i<registered;++i){
		if (wal_set_firmware(worker_table[i].wrk, WAL_PBID_P0, fw[i], -1))
		{
			fprintf(stderr, "Error: could not set firmware for worker 0x%x\n", (unsigned int)worker_table[i].wrk);
			return ;
		}
		worker_table[i].fw0=i;
	}

	for(i=0;i<registered;++i){
		if (wal_reset_worker(worker_table[i].wrk))
		{
			fprintf(stderr, "Error: could not reset worker 0x%x\n", (unsigned int)worker_table[i].wrk);
			return ;
		}
	}

}


/*
 * Reserve a free worker that already has the requested firmware loaded.
 *
 * fw_id  - ID of the firmware
 * return - index into worker_table of the reserved worker
 *
 * The table is scanned under global_lock. A worker qualifies when it is free
 * (state == 1) and fw_id equals its fw0 or fw1. The matching worker is marked
 * reserved (state = 0) and useFw records which of the two slots matched.
 * If no worker qualifies, the lock is dropped, PASSIVE_WAIT() is called and
 * the scan is repeated, so the function does not return until a worker
 * becomes available.
 *
 * global_lock is initialized statically and is not re-initialized here:
 * doing so while another thread holds it is undefined.
 *
 * Loading a firmware into a free worker on demand is not implemented; the
 * code that attempted it could never be reached and has been removed.
 */

unsigned int fw_init_fw(unsigned int fw_id){

	int i;

	for(i=0;i<WORKER_COUNT;++i){
		LOGFW("Worker[%d]: fw0: %d, fw1: %d, state: %d, wrk 0x%x\n",
				i, (int)worker_table[i].fw0, (int)worker_table[i].fw1, (int)worker_table[i].state, (int)worker_table[i].wrk);
	}

	for(;;)
	{
		pthread_mutex_lock(&global_lock);
		for(i=0;i<WORKER_COUNT;++i)
		{
			int slot;

			if(worker_table[i].state!=1)
			{
				continue; // worker is reserved
			}

			if(worker_table[i].fw0==(int)fw_id)
			{
				slot=0;
			}
			else if(worker_table[i].fw1==(int)fw_id)
			{
				slot=1;
			}
			else
			{
				continue; // free, but holds a different firmware
			}

			LOGFW("Found firmware %d, wrk 0x%x, thread %d of %d.\n", fw_id, (int)worker_table[i].wrk, omp_get_thread_num(), omp_get_num_threads());
			worker_table[i].state=0;
			worker_table[i].useFw=slot;
			pthread_mutex_unlock(&global_lock);
			return (unsigned int)i;
		}

		// nothing suitable right now - let other threads release a worker
		pthread_mutex_unlock(&global_lock);
		PASSIVE_WAIT();
	}
}

/*
 * Start the operation on a reserved worker and wait for it to finish.
 *
 * wrk_index - index of the worker in worker_table
 *
 * Under global_lock: starts WAL_PBID_P0 or WAL_PBID_P1 according to the
 * worker's useFw (nothing is started for any other value) and sends
 * MMULT_TRIG with wal_mb2pb(). The lock is then released while polling
 * wal_is_busy(), and taken again to read the worker's reply with wal_pb2mb().
 *
 * If wal_mb2pb() fails, an error is printed to stderr and the function
 * returns without waiting; the lock is released on that path as well.
 */
void fw_op_start(unsigned int wrk_index){
	uint32_t get;

	pthread_mutex_lock(&global_lock);

	CLOCK_START();
	if(worker_table[wrk_index].useFw == 0){
		wal_start_operation(worker_table[wrk_index].wrk, WAL_PBID_P0);
	}else if(worker_table[wrk_index].useFw == 1){
		wal_start_operation(worker_table[wrk_index].wrk, WAL_PBID_P1);
	}

	//trigger op in firmware
	if (wal_mb2pb(worker_table[wrk_index].wrk, MMULT_TRIG)) {
		fprintf(stderr, "Error: fw_op_start - wal_mb2pb failed!\n");
		pthread_mutex_unlock(&global_lock);
		return;
	}

	pthread_mutex_unlock(&global_lock);

	//wait for result
	while (wal_is_busy(worker_table[wrk_index].wrk)!=0) {
		PASSIVE_WAIT();
	}
	CLOCK_END(g_clock_calc);

	pthread_mutex_lock(&global_lock);
	/* the reply word and the status are read and discarded, as before */
	(void)wal_pb2mb(worker_table[wrk_index].wrk, &get);
	pthread_mutex_unlock(&global_lock);
}


/* Reset the three accumulated profiling counters to zero. */
void ravac_init_profile_time()
{
	g_clock_data_out = g_clock_data_in = g_clock_calc = 0;
}


/*
 * Print the accumulated profiling counters to stdout, each as seconds
 * followed by the raw nanosecond count in parentheses.
 *
 * "data out" reports g_clock_data_out and "data in" reports g_clock_data_in.
 * The counters are unsigned long long, so they are printed with %llu.
 */
void ravac_print_profile_time()
{
	printf("BCE times: calculation: %f (%llu), data out: %f (%llu), data in: %f (%llu).\n",
	  (double)(g_clock_calc) / NSEC_IN_SEC, g_clock_calc,
	  (double)(g_clock_data_out) / NSEC_IN_SEC, g_clock_data_out,
	  (double)(g_clock_data_in) / NSEC_IN_SEC, g_clock_data_in
	);

}
