/*
	Copyright (c) 2002, Thomas Kurschel
	

	Part of Radeon accelerant
		
	Hardware accelerator management
	
	All accelerator commands go through the following steps:
	- accelerant adds command to CP buffer and updates CP write pointer
	- CP fetches command and sends it to MicroController
	- MicroController instructs 2D unit to execute command
	- 2D unit draws into 2D Destination Cache (DC)
	- 2D Destination Cache is drained to frame buffer
	
	Whenever a token is required by BeOS, a command is queued to write
	the timestamp into Scratch Register 0. I haven't fully understood
	when and how coherency is ensured by Radeon, so I assume the following:
	- when the timestamp is written, all previous commands have been issued,
	  i.e. they are read and executed by the microcontroller
	- to make sure previously issued 2D commands have been finished,
	  a WAIT_2D_IDLECLEAN command is inserted before the scratch register 
	  write
	- to flush the destination cache, a RB2D_DC_FLUSH_ALL command is
	  issued before the wait; I hope that the wait command also waits for
	  the flush command, but I'm not sure about that
	  
	There remains the cache coherency problem. You can set various bits in
	the DSTCACHE_MODE register to ensure it, but first I don't really understand
	them, and second I'm not sure which other caches/FIFOs may cause trouble.
	Especially, Be wants to use CPU and CP accesses in parallel. Hopefully,
	they don't interfere.
	
	I know that the PAINT_MULTI commands cause trouble if you change the
	ROP to something else: CPU writes produce garbage in frame buffer for the
	next couple of accesses. Resetting the ROP to a simple copy helps, but 
	I'm not sure what happens with concurrent CPU accesses to other areas 
	of the frame buffer.
*/


#include "radeon_accelerant.h"
#include "generic.h"
#include "rbbm_regs.h"
#include "GlobalData.h"
#include "mmio.h"
#include "CP.h"

// the one engine token of this accelerant; ACQUIRE_ENGINE hands out
// its address to every caller
static engine_token radeon_engine_token = { 1, B_2D_ACCELERATION, NULL };

// public function: report how many hardware engines this accelerant offers
//	returns - always 1
uint32 ACCELERANT_ENGINE_COUNT(void) 
{
	// only a single accelerator engine is exposed by this driver
	const uint32 engine_count = 1;

	return engine_count;
}

// emit the current sync token towards the hardware;
// before that, the 2D destination cache is flushed and the engine is told
// to wait until 2D, 3D and host are idle and clean, so that subsequent 
// host writes don't get mixed up with pending accelerator output.
// does nothing if the current token has been emitted already
//	ai - accelerator whose shared info holds the engine counters
static void writeSyncToken( accelerator_info *ai )
{
	// same wait condition for both the DMA and the register path
	const uint32 idle_clean_mask = RADEON_WAIT_2D_IDLECLEAN |
		RADEON_WAIT_3D_IDLECLEAN | RADEON_WAIT_HOST_IDLECLEAN;

	// token unchanged since last write - nothing to do
	if( ai->si->engine.written == ai->si->engine.count )
		return;

	if( !ai->si->acc_dma ) {
		// no DMA: program the registers directly;
		// two FIFO slots are needed for the two register writes
		Radeon_WaitForFifo( ai, 2 );

		OUTREG( ai->regs, RADEON_RB2D_DSTCACHE_CTLSTAT, RADEON_RB2D_DC_FLUSH_ALL );
		OUTREG( ai->regs, RADEON_WAIT_UNTIL, idle_clean_mask );

		ai->si->engine.written = ai->si->engine.count;
	} else {
		START_IB();

		// drain the destination cache
		WRITE_IB_REG( RADEON_RB2D_DSTCACHE_CTLSTAT, RADEON_RB2D_DC_FLUSH_ALL );

		// let previously issued commands finish
		WRITE_IB_REG( RADEON_WAIT_UNTIL, idle_clean_mask );

		// finally store the token in the scratch register
		WRITE_IB_REG( RADEON_SCRATCH_REG0, ai->si->engine.count );

		ai->si->engine.written = ai->si->engine.count;

		SUBMIT_IB();
	}
}

// public function: take ownership of the accelerator engine
//	capabilities - required 2D/3D capabilities of engine, ignored
//	max_wait - maximum time we want to wait (in ms?), ignored
//	st - if not NULL, sync token to wait for once the engine is owned
//	et - (out) receives the token of the acquired engine
//	returns - always B_OK; the result of the sync is not propagated
status_t ACQUIRE_ENGINE( uint32 capabilities, uint32 max_wait, 
	sync_token *st, engine_token **et ) 
{
	// there is only one engine and we block until we get it
	(void)capabilities;
	(void)max_wait;

	SHOW_FLOW0( 4, "" );

	// lock stays held until RELEASE_ENGINE
	ACQUIRE_BEN( ai->si->engine.lock )

	// caller wants previous work to be done first
	if( st != NULL )
		SYNC_TO_TOKEN( st );

	*et = &radeon_engine_token;

	return B_OK;
}

// public function: give the accelerator engine back
//	et - engine to release
//	st - (out) if not NULL, filled with a sync token for the work
//	     issued so far
//	returns - always B_OK
status_t RELEASE_ENGINE( engine_token *et, sync_token *st ) 
{
	SHOW_FLOW0( 4, "" );

	if( st != NULL ) {
		// emit the token before handing its value to the caller
		writeSyncToken( ai );

		st->counter = ai->si->engine.count;
		st->engine_id = et->engine_id;
	}

	// counterpart of the ACQUIRE_BEN in ACQUIRE_ENGINE
	RELEASE_BEN( ai->si->engine.lock )

	return B_OK;
}

// public function: wait until engine is idle 
// ??? which engine to wait for? Is there anyone using this function?
//     is lock hold?
void WAIT_ENGINE_IDLE(void) 
{
	SHOW_FLOW0( 4, "" );
	
	Radeon_WaitForIdle( ai, false );
}

// public function: obtain a sync token for the work issued so far
//	et - engine the token belongs to
//	st - (out) sync token to be filled out
//	returns - always B_OK
status_t GET_SYNC_TOKEN( engine_token *et, sync_token *st )
{
	shared_info *si;

	SHOW_FLOW0( 4, "" );

	// emit the token (no-op if the current one was emitted before)
	writeSyncToken( ai );

	si = ai->si;

	st->counter = si->engine.count;
	st->engine_id = et->engine_id;

	SHOW_FLOW( 4, "got counter=%d", si->engine.count );

	return B_OK;
}

// busy-wait without yielding the CPU
// (this is the same as the corresponding kernel function)
//	delay - time to spin, in units of system_time()
void Radeon_Spin( uint32 delay )
{
	const bigtime_t spin_start = system_time();

	for( ;; ) {
		// leave as soon as at least "delay" has elapsed
		if( system_time() - spin_start >= delay )
			break;
	}
}

// public: sync to token
//	st - token to wait for
status_t SYNC_TO_TOKEN( sync_token *st ) 
{
	shared_info *si = ai->si;
	bigtime_t start_time, sample_time;
	
	SHOW_FLOW0( 4, "" );
	
	if ( !ai->si->acc_dma )
	{
		Radeon_WaitForFifo( ai, 64 );
		Radeon_WaitForIdle( ai, false );
		return B_OK;
	}
	
	start_time = system_time();

	while( 1 ) {
		SHOW_FLOW( 4, "passed counter=%d", 
			((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0] );
			//si->cp.scratch.ptr[0] );
		
		// a bit nasty: counter is 64 bit, but we have 32 bit only,
		// this is a tricky calculation to handle wrap-arounds correctly
		if( (int32)(
			((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0]
			//si->cp.scratch.ptr[0] 
			- st->counter) >= 0 )
			return B_OK;
		/*if( (int32)(INREG( ai->regs, RADEON_SCRATCH_REG0 ) - st->counter) >= 0 )
			return B_OK;*/
		
		// commands have not been finished;
		// this is a good time to free completed buffers as we have to
		// busy-wait anyway
		ACQUIRE_BEN( si->cp.lock );
		Radeon_FreeIndirectBuffers( ai );
		RELEASE_BEN( si->cp.lock );

		sample_time = system_time();
		
		if( sample_time - start_time > 100000 )
			break;

		// use exponential fall-off
		// in the beginning do busy-waiting, later on we let thread sleep
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 ) 
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );
	} 

	// we could reset engine now, but caller doesn't need to acquire
	// engine before calling this function, so we either reset it
	// without sync (ouch!) or acquire engine first and risk deadlocking
	SHOW_ERROR( 0, "Failed waiting for token %d (active token: %d)",
		st->counter, /*INREG( ai->regs, RADEON_SCRATCH_REG0 )*/
		((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0] );
		//si->cp.scratch.ptr[0] );
		
	Radeon_ResetEngine( ai );
		
	return B_ERROR;
}