libflame  12600
Data Structures | Typedefs | Functions
FLASH_Queue_exec.c File Reference

(r12600)

Data Structures

struct  FLA_Obj_gpu_struct
struct  FLASH_Queue_variables
struct  FLASH_Queue_variables

Typedefs

typedef struct FLA_Obj_gpu_struct FLA_Obj_gpu
typedef struct FLASH_Queue_variables FLASH_Queue_vars

Functions

void FLASH_Queue_exec (void)
void FLASH_Queue_init_tasks (void *arg)
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
FLASH_Task * FLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
FLASH_Task * FLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
void FLASH_Queue_prefetch (int cache, void *arg)
void FLASH_Queue_prefetch_block (FLA_Obj obj)
FLASH_Task * FLASH_Queue_work_stealing (int queue, void *arg)
void FLASH_Queue_create_gpu (int thread, void *arg)
void FLASH_Queue_destroy_gpu (int thread, void *arg)
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_flush_gpu (int thread, void *arg)
void FLASH_Queue_exec_parallel (void *arg)
void * FLASH_Queue_exec_parallel_function (void *arg)
FLASH_Task * FLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
FLASH_Task * FLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
void FLASH_Queue_exec_simulation (void *arg)
int RCCE_acquire_lock (int)
int RCCE_release_lock (int)
double RCCE_wtime (void)
int RCCE_ue (void)
void Synch_all ()

Typedef Documentation


Function Documentation

FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

{
   // Interpret the opaque argument as the shared SuperMatrix variables.
   FLASH_Queue_vars* vars = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks         = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool up_to_date    = TRUE;
   int idx;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

   // Search this thread's GPU software cache for the block.
   idx = 0;
   while ( idx < n_blocks && obj.base != vars->gpu[thread * n_blocks + idx].obj.base )
      idx++;

   // If the block is resident and dirty, flag it so the owning thread
   // flushes it back to main memory; the block is not yet usable.
   if ( idx < n_blocks && !vars->gpu[thread * n_blocks + idx].clean )
   {
      vars->gpu[thread * n_blocks + idx].request = TRUE;

      up_to_date = FALSE;
   }

   // The block is also unavailable while it is this GPU's eviction victim.
   if ( obj.base == vars->victim[thread].obj.base )
      up_to_date = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

   return up_to_date;
}
FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   // Decide whether task t may execute on its thread's GPU right now.
   // Returns TRUE only when no *other* GPU holds a dirty copy of any
   // distinct block that t reads or writes.  Dirty remote copies are
   // flagged for flushing by FLASH_Queue_check_block_gpu().
   int i, j, k;
   int thread        = t->thread;
   int n_input_args  = t->n_input_args;
   int n_output_args = t->n_output_args;
   int n_threads     = FLASH_Queue_get_num_threads();
   FLA_Bool r_val    = TRUE;  // overall verdict for the whole task
   FLA_Bool t_val;            // verdict for the block currently examined
   FLA_Bool duplicate;
   FLA_Obj  obj;

   // Check the input and output arguments on the GPUs.
   for ( i = 0; i < n_input_args + n_output_args; i++ )
   {
      // Check for duplicate blocks: a block already handled as an earlier
      // input or as any output must not be examined twice.
      duplicate = FALSE;

      // Find the correct input or output argument.
      if ( i < n_input_args )
      {
         obj = t->input_arg[i];

         for ( j = 0; j < n_output_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }
         
         for ( j = 0; j < i && !duplicate; j++ )
         {
            if ( obj.base == t->input_arg[j].base )
               duplicate = TRUE;
         }
      }
      else
      {
         obj = t->output_arg[i - n_input_args];

         for ( j = 0; j < i - n_input_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }
      }

      // If the block has not been processed before.
      if ( !duplicate )
      {     
         // Macroblock is used: the argument is a matrix of blocks.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
            
            // Check each block within the macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );
                  
                  t_val = TRUE;
                  
                  // Check to see if the block is dirty on another GPU.
                  // The executing thread's own GPU (k == thread) is skipped.
                  for ( k = 0; k < n_threads && t_val; k++ )
                     if ( k != thread )
                        t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
                  
                  r_val = r_val && t_val;
               }
            }
         }
         else
         {
            t_val = TRUE;
            
            // Check to see if the block is dirty on another GPU.
            // The executing thread's own GPU (k == thread) is skipped.
            for ( k = 0; k < n_threads && t_val; k++ )
               if ( k != thread )
                  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
            
            r_val = r_val && t_val;
         }
      }
   }

   return r_val;
}
void FLASH_Queue_create_gpu ( int  thread,
void *  arg 
)

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), and FLASH_Queue_variables::gpu.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Per-thread GPU setup: bind the worker to a device and pre-allocate
   // one GPU buffer for every slot of its software cache.
   FLASH_Queue_vars* vars = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks         = FLASH_Queue_get_gpu_num_blocks();
   int   b;

   // Nothing to do when GPU execution is disabled.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

   // Pin this worker thread to its GPU device.
   FLASH_Queue_bind_gpu( thread );

   // Allocate all GPU-side buffers up front, one per cache slot.
   for ( b = 0; b < n_blocks; b++ )
      FLASH_Queue_alloc_gpu( vars->block_size, vars->datatype,
                             &(vars->gpu[thread * n_blocks + b].buffer_gpu) );

   return;
}
void FLASH_Queue_destroy_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and FLA_Obj_gpu_struct::obj.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Per-thread GPU teardown: flush any dirty resident blocks back to
   // main memory, then release every pre-allocated GPU buffer.
   FLASH_Queue_vars* vars = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks         = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu slot;
   int   b;

   // Nothing to tear down when GPU execution is disabled.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

   // Walk every software-cache slot owned by this thread.
   for ( b = 0; b < n_blocks; b++ )
   {
      slot = vars->gpu[thread * n_blocks + b];

      // Write dirty resident blocks back to main memory before freeing.
      if ( slot.obj.base != NULL && !slot.clean )
         FLASH_Queue_read_gpu( slot.obj, slot.buffer_gpu );

      // Release the GPU buffer for this slot.
      FLASH_Queue_free_gpu( slot.buffer_gpu );
   }

   return;
}
void FLASH_Queue_exec ( void  )

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

Referenced by FLASH_Queue_end().

{
   // Drives one round of SuperMatrix execution: configures queueing
   // policy (data affinity, work stealing, caching), builds all shared
   // state (locks, software caches, waiting queues, optional per-GPU
   // caches) in a FLASH_Queue_vars record, runs the task DAG either in
   // parallel or in simulation, then tears everything down and resets
   // the queue for the next invocation.
   int          n_tasks    = FLASH_Queue_get_num_tasks();
   int          n_threads  = FLASH_Queue_get_num_threads();
   int          n_queues;
   int          n_caches;
   int          size;
   int          i;
   dim_t        block_size = FLASH_Queue_get_block_size();
   double       dtime;

   FLA_Lock*    run_lock;
   FLA_Lock*    dep_lock;
   FLA_Lock*    war_lock;
   FLA_Lock*    cac_lock;

   FLA_Obj*     cache;
   FLA_Obj*     prefetch;
   FLASH_Queue* wait_queue;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock*    gpu_lock;
#endif
   FLA_Obj_gpu* gpu;
   FLA_Obj_gpu* victim;
   FLA_Obj_gpu* gpu_log;
   dim_t        gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
#endif

   // All the necessary variables for the SuperMatrix mechanism.
   FLASH_Queue_vars args;

   // If the queue is empty, return early.
   if ( n_tasks == 0 )
      return;

#ifndef FLA_ENABLE_MULTITHREADING
   // Turn off work stealing in simulation mode.
   FLASH_Queue_set_work_stealing( FALSE );
#endif

   // Query the number of user set threads per queue.
   n_queues = FLASH_Queue_get_cores_per_queue();

   // Default user setting for number of threads.
   if ( n_queues <= 0 )
   {
      // Do not use data affinity or work stealing when caching is enabled.
      if ( FLASH_Queue_get_caching() )
      {
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }
      
      // Do not use work stealing when data affinity is enabled.
      if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
      {
         FLASH_Queue_set_work_stealing( FALSE );
      }
      
      // Allocate different arrays if using data affinity.
      // A single shared queue suffices only when neither data affinity
      // nor work stealing is in effect; otherwise use one per thread.
      n_queues = ( FLASH_Queue_get_data_affinity() == 
                   FLASH_QUEUE_AFFINITY_NONE &&
                   !FLASH_Queue_get_work_stealing() ? 1 : n_threads );      
   }
   else
   {
      // Set the number of queues.
      n_queues = n_threads / n_queues;

      // Must use at least one queue.
      if ( n_queues == 0 )
         n_queues = 1;

      if ( n_queues == 1 )
      {
         // Turn off all multiple queue implementations.
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }
      else
      {
         // Use 2D data affinity for multiple queues if nothing is set.
         if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
              !FLASH_Queue_get_work_stealing() )
         {
            FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC );
         }
      }
   }

   // Determine the number of caches.
   n_caches = n_threads / FLASH_Queue_get_cores_per_cache();

   args.n_queues = n_queues;
   args.n_caches = n_caches;

#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate memory for array of locks.
   run_lock = ( FLA_Lock* ) FLA_malloc( n_queues  * sizeof( FLA_Lock ) );
   dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches  * sizeof( FLA_Lock ) );

   args.run_lock = run_lock;
   args.dep_lock = dep_lock;
   args.war_lock = war_lock;
   args.cac_lock = cac_lock;

   // Initialize the all lock.
   FLA_Lock_init( &(args.all_lock) );
   
   // Initialize the run lock for thread i.
   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_init( &(args.run_lock[i]) );
   }

   // Initialize the dep and war locks for thread i.
   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_init( &(args.dep_lock[i]) );
      FLA_Lock_init( &(args.war_lock[i]) );
   }

   // Initialize the cac locks for each cache.
   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_init( &(args.cac_lock[i]) );
   }
#endif

   // The number of blocks that can fit into the cache on each thread.
   if ( block_size == 0 )
      size = MIN_CACHE_BLOCKS;
   else
      size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS);
   args.size = size;

   // Allocate memory for cache, prefetch buffer, and waiting queue.
   cache = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
   prefetch = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
   wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ));
   
   args.cache = cache;
   args.prefetch = prefetch;
   args.wait_queue = wait_queue;

   // Initialize cache, prefetch buffer, and waiting queue.
   // A NULL base marks an empty cache/prefetch slot.
   for ( i = 0; i < size * n_caches; i++ )
      args.cache[i].base = NULL;

   for ( i = 0; i < size; i++ )
      args.prefetch[i].base = NULL;

   for ( i = 0; i < n_queues; i++ )
   {
      args.wait_queue[i].n_tasks = 0;
      args.wait_queue[i].head = NULL;
      args.wait_queue[i].tail = NULL;
   }

   // Initialize the aggregate task counter.
   args.pc = 0;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate and initialize the gpu locks (one per thread/GPU).
   gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   args.gpu_lock = gpu_lock;   

   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_init( &(args.gpu_lock[i]) );
#endif
   // Allocate and initialize GPU software cache
   // (gpu_n_blocks slots per thread, all initially empty and clean).
   gpu = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu = gpu;

   for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
   {
      args.gpu[i].obj.base   = NULL;
      args.gpu[i].buffer_gpu = NULL;
      args.gpu[i].clean      = TRUE;
      args.gpu[i].request    = FALSE;
   }

   // One eviction-victim record per thread; NULL base means no victim.
   victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
   args.victim = victim;

   for ( i = 0; i < n_threads; i++ )
      args.victim[i].obj.base = NULL;

   // gpu_log is only allocated and released in this function; it is
   // populated elsewhere during execution — see
   // FLASH_Queue_variables::gpu_log usage in the worker routines.
   gpu_log = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu_log = gpu_log;
#endif

   // Initialize tasks with critical information.
   FLASH_Queue_init_tasks( ( void* ) &args );
   
   // Display verbose output before freeing all tasks.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();
   
   // Start timing the parallel execution.
   dtime = FLA_Clock();

#ifdef FLA_ENABLE_MULTITHREADING
   // Parallel Execution!
   FLASH_Queue_exec_parallel( ( void* ) &args );
#else
   // Simulation!
   FLASH_Queue_exec_simulation( ( void* ) &args );
#endif
   
   // End timing the parallel execution.
   dtime = FLA_Clock() - dtime;
   FLASH_Queue_set_parallel_time( dtime );

#ifdef FLA_ENABLE_MULTITHREADING   
   // Destroy the locks.
   FLA_Lock_destroy( &(args.all_lock) );

   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_destroy( &(args.run_lock[i]) );
   }

   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_destroy( &(args.dep_lock[i]) );
      FLA_Lock_destroy( &(args.war_lock[i]) );
   }

   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_destroy( &(args.cac_lock[i]) );
   }

   // Deallocate memory.
   FLA_free( run_lock );
   FLA_free( dep_lock );
   FLA_free( war_lock );
   FLA_free( cac_lock );
#endif

   FLA_free( cache );
   FLA_free( prefetch );
   FLA_free( wait_queue );

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_destroy( &(args.gpu_lock[i]) );
   FLA_free( gpu_lock );
#endif
   FLA_free( gpu );
   FLA_free( victim );
   FLA_free( gpu_log );
#endif

   // Reset values for next call to FLASH_Queue_exec().
   FLASH_Queue_reset();

   return;
}
FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Attempt to execute task t on the GPU assigned to its thread.
   // Returns TRUE when the task has been executed (on GPU, or on CPU
   // when GPUs are disabled or the task is not GPU-capable), and FALSE
   // when the task was re-enqueued because some of its blocks are still
   // dirty on another GPU and must be flushed first.
   void** input_arg;
   void** output_arg;

   if ( t == NULL )
      return TRUE;

   // If not using the GPU, then execute on CPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
   {
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Check if all the operands are ready and up to date.
   if ( !FLASH_Queue_check_gpu( t, arg ) )
   {
      FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
      int queue = t->queue;
      // Mark the task as a cache miss before re-enqueueing it.
      t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif
      // Reenqueue the task if the blocks are not all flushed.
      FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***    
#endif

      return FALSE;
   }

   // If GPU is enabled, but the task is not supported for GPU execution.
   // The task will run on the CPU, so main memory must hold current
   // copies of its operands and stale GPU copies must be invalidated.
   if ( !t->enabled_gpu )
   {
      int i, j, k;
      int thread        = t->thread;
      int n_input_args  = t->n_input_args;
      int n_output_args = t->n_output_args;
      int n_threads     = FLASH_Queue_get_num_threads();
      FLA_Bool duplicate;
      FLA_Obj  obj;

      // Check the blocks on each GPU.
      for ( k = 0; k < n_threads; k++ )
      {
         // Check the input and output arguments on the GPUs.
         for ( i = 0; i < n_input_args + n_output_args; i++ )
         {
            // Check for duplicate blocks: skip a block already handled
            // as an earlier input or as any output.
            duplicate = FALSE;
            
            // Find the correct input or output argument.
            if ( i < n_input_args )
            {
               obj = t->input_arg[i];
               
               for ( j = 0; j < n_output_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }
               
               for ( j = 0; j < i && !duplicate; j++ )
               {
                  if ( obj.base == t->input_arg[j].base )
                     duplicate = TRUE;
               }
            }
            else
            {
               obj = t->output_arg[i - n_input_args];
               
               for ( j = 0; j < i - n_input_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }
            }
            
            // If the block has not been processed before.
            if ( !duplicate )
            {     
               // Macroblock is used: the argument is a matrix of blocks.
               if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               {
                  dim_t    jj, kk;
                  dim_t    m    = FLA_Obj_length( obj );
                  dim_t    n    = FLA_Obj_width( obj );
                  dim_t    cs   = FLA_Obj_col_stride( obj );
                  FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
                  
                  // Process each block in the macroblock.
                  for ( jj = 0; jj < n; jj++ )
                  {
                     for ( kk = 0; kk < m; kk++ )
                     {
                        obj = *( buf + jj * cs + kk );
                        
                        // Flush the block to main memory if it is on the GPU.
                        if ( k == thread )
                           FLASH_Queue_flush_block_gpu( obj, k, arg );
                        
                        // Invalidate output block on all GPUs.
                        if ( i >= n_input_args )
                           FLASH_Queue_invalidate_block_gpu( obj, k, arg );
                     }
                  }
               }
               else
               {
                  // Flush the block to main memory if it is on the GPU.
                  if ( k == thread )
                     FLASH_Queue_flush_block_gpu( obj, k, arg );
                  
                  // Invalidate output block on all GPUs.
                  if ( i >= n_input_args )
                     FLASH_Queue_invalidate_block_gpu( obj, k, arg );
               }
            }
         }
      }

      // Execute the task on CPU instead of GPU.
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Gather the pointers for the data on the GPU.
   input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
   output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

   // Bring all the blocks to GPU.
   FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );

   // Execute the task on GPU.
   FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );

   // Mark all the output blocks as dirty.
   FLASH_Queue_mark_gpu( t, arg );

   // Free memory.
   FLA_free( input_arg );
   FLA_free( output_arg );

   return TRUE;
}
void FLASH_Queue_exec_parallel ( void *  arg)

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_get_num_threads().

Referenced by FLASH_Queue_exec().

{
   // Spawns n_threads workers — via OpenMP or POSIX threads, depending
   // on FLA_MULTITHREADING_MODEL — each running
   // FLASH_Queue_exec_parallel_function() with its own FLASH_Thread
   // argument, and waits for all of them to complete before returning.
   int   i;
   int   n_threads = FLASH_Queue_get_num_threads();
   void* (*thread_entry_point)( void* );

   // Allocate the thread structures array. Here, an array of FLASH_Thread
   // structures of length n_threads is allocated and the fields of each
   // structure set to appropriate values.
   FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

   // Initialize the thread structures array.
   for ( i = 0; i < n_threads; i++ )
   {
      // Save the thread's identifier.
      thread[i].id = i;

      // Save the pointer to the necessary variables with the thread.
      thread[i].args = arg;

      // The pthread object, if it was even compiled into the FLASH_Thread
      // structure, will be initialized by the pthread implementation when we
      // call pthread_create() and does not need to be touched at this time.
   }

   // Determine which function to send threads to.
   thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // An OpenMP parallel for region spawns n_threads threads. Each thread
   // executes the work function with a different FLASH_Thread argument.
   // An implicit synchronization point exists at the end of the curly
   // brace scope.
   #pragma omp parallel for \
           private( i ) \
           shared( thread, n_threads, thread_entry_point ) \
           schedule( static, 1 ) \
           num_threads( n_threads )
   for ( i = 0; i < n_threads; ++i )
   {
      thread_entry_point( ( void* ) &thread[i] );
   }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // Create each POSIX thread needed in addition to the main thread.
   for ( i = 1; i < n_threads; i++ )
   {
      int pthread_e_val;

      // Create thread i with default attributes.
      pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                      NULL,
                                      thread_entry_point,
                                      ( void* ) &thread[i] );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

   // The main thread is assigned the role of thread 0. Here we manually
   // execute it as a worker thread.
   thread_entry_point( ( void* ) &thread[0] );

   // Wait for non-main threads to finish.
   for ( i = 1; i < n_threads; i++ )
   {
      // These two variables are declared local to this for loop since this
      // is the only place they are needed, and since they would show up as
      // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
      // Strangely, the Intel compiler produces code that results in an
      // "unaligned access" runtime message if thread_status is declared as
      // an int. Declaring it as a long or void* appears to force the
      // compiler (not surprisingly) into aligning it to an 8-byte boundary.
      int   pthread_e_val;
      void* thread_status;

      // Wait for thread i to invoke its respective pthread_exit().
      // The return value passed to pthread_exit() is provided to us
      // via status, if one was given.
      pthread_e_val = pthread_join( thread[i].pthread_obj,
                                    ( void** ) &thread_status );
      
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }
   
#endif

   FLA_free( thread );

   return;
}
void* FLASH_Queue_exec_parallel_function ( void *  arg)

References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

{
   // Worker-thread body: repeatedly dequeues a task from this thread's
   // waiting queue (or takes a task bound by a dependency update or work
   // stealing), executes it, and updates dependencies, looping until all
   // n_tasks have committed (tracked by the shared counter args->pc).
   FLASH_Queue_vars* args;   
   int           i;
   int           queue;
   int           cache;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLA_Bool      caching   = FLASH_Queue_get_caching();
   FLA_Bool      stealing  = FLASH_Queue_get_work_stealing();
   FLA_Bool      committed = TRUE;
   FLA_Bool      condition = TRUE;
   FLA_Bool      enabled   = FALSE;
   FLA_Bool      available;
   FLASH_Task*   t = NULL;
   FLASH_Task*   r = NULL;  // task bound to this thread for the next iteration
   FLASH_Thread* me;
   //cpu_set_t     cpu_set;

   // Interpret the thread argument as what it really is--a pointer to an
   // FLASH_Thread structure.
   me = ( FLASH_Thread* ) arg;

   // Extract the variables from the current thread.
   args = ( FLASH_Queue_vars* ) me->args;

   // Figure out the id of the current thread.
   i = me->id;

   // Set the CPU affinity; We want the current thread i to run only on CPU i.
   //CPU_ZERO( &cpu_set );
   //CPU_SET( i, &cpu_set );
   //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );

   // Determine to which queue this thread belongs.
   queue = i / ( n_threads / args->n_queues );

   // Determine to which cache this thread belongs.
   cache = i / n_cores;

#ifdef FLA_ENABLE_GPU
   // Create memory on GPU.
   FLASH_Queue_create_gpu( i, ( void* ) args );

   // Save whether GPUs are enabled.
   enabled = FLASH_Queue_get_enabled_gpu();

   // Only use each GPU as its own cache when GPUs are enabled.
   if ( enabled )
      cache = i;
#endif

   // Prefetch blocks into the cache before execution.
   // Only the first thread of each cache group (i % n_cores == 0) does this.
   if ( caching && !enabled && i % n_cores == 0 )
      FLASH_Queue_prefetch( cache, ( void* ) args );

   // Loop until all the tasks have committed.
   while ( condition )
   {
#ifdef FLA_ENABLE_GPU
      // Check to see if any blocks on GPU need to be flushed.
      FLASH_Queue_flush_gpu( i, ( void* ) args );
#endif      

      // Dequeue a task if there has not been one binded to thread.
      if ( r == NULL )
      {
         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
         
         // Obtain task to execute.
         t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
         
         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
      else
      {
         // Obtain the binded task.
         t = r;
         r = NULL;
      }

      // Dequeued a task from the waiting queue.
      available = ( t != NULL );

      if ( available )
      {
         // Save the thread and cache that executes the task.
         t->thread = i;
         t->cache = cache;

         if ( caching && !enabled )
         {
            // Update the current state of the cache.
            FLASH_Queue_update_cache( t, ( void* ) args );
         }        

#ifdef FLA_ENABLE_GPU         
         // Execute the task on GPU.
         // FALSE means the task was re-enqueued and has not committed.
         committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
#else
         // Execute the task.
         FLASH_Queue_exec_task( t );
#endif

         // If the task has executed or not.
         if ( committed )
         {
            // Update task dependencies.
            // May return a follow-up task bound to this thread.
            r = FLASH_Task_update_dependencies( t, ( void* ) args );
            
            // Free the task once it executes in parallel.
            FLASH_Task_free_parallel( t, ( void* ) args );
         }
      }
      else
      {
         if ( stealing )
         {
            // Perform work stealing if there are no tasks to dequeue.
            r = FLASH_Queue_work_stealing( queue, ( void* ) args );
         }
      }

      FLA_Lock_acquire( &(args->all_lock) ); // A ***

      // Increment program counter.
      if ( available && committed )
         args->pc++;

      // Terminate loop.
      if ( args->pc >= n_tasks )
         condition = FALSE;
      
      FLA_Lock_release( &(args->all_lock) ); // A ***
   }

#ifdef FLA_ENABLE_GPU
   // Destroy and flush contents of GPU back to main memory.
   FLASH_Queue_destroy_gpu( i, ( void* ) args );
#endif
   
#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
   // If this is a non-main thread, then exit with a zero (normal) error code.
   // The main thread cannot call pthread_exit() because this routine never
   // returns. The main thread must proceed so it can oversee the joining of
   // the exited non-main pthreads.
   if ( i != 0 )
      pthread_exit( ( void* ) NULL );
#endif

   return ( void* ) NULL;
}
void FLASH_Queue_exec_simulation ( void *  arg)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, FLASH_Dep_s::task, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec().

{
   // Single-threaded, stage-by-stage simulation of parallel execution.
   // Each stage dequeues at most one ready task per simulated thread,
   // "executes" those tasks, then releases their dependents so they can
   // be dequeued in a later stage. A stage-by-stage trace of task names
   // is printed.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int           i, j;
   int           queue;            // waiting queue this thread draws from
   int           cache;            // cache this thread belongs to
   int           n_stages  = 0;    // number of simulation stages executed
   int           n_queues  = args->n_queues;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   task;
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // An array holding the task each simulated thread executes during the
   // current stage of the simulation.
#ifdef FLA_ENABLE_WINDOWS_BUILD
   // MSVC has no C99 VLAs, so allocate the array on the heap instead.
   FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
   FLASH_Task* exec_array[n_threads];
#endif

   for ( i = 0; i < n_threads; i++ )
   {
      // Initialize all exec_array to NULL.
      exec_array[i] = NULL;

      // Prefetch blocks into the cache before execution; only the first
      // thread of each cache group prefetches.
      // NOTE(review): the cache argument receives the thread index i,
      // not i / n_cores -- verify against FLASH_Queue_prefetch's use.
      if ( i % n_cores == 0 )
         FLASH_Queue_prefetch( i, arg );
   }

   // Loop until all the tasks have committed.
   while ( args->pc < n_tasks )
   {
      for ( i = 0; i < n_threads; i++ )
      {
         // Release the dependents of the task this thread executed in
         // the previous stage, then free that task.
         t = exec_array[i];
         
         if ( t != NULL )
         {
            // Check each dependent task.
            d = t->dep_arg_head;
            
            for ( j = 0; j < t->n_dep_args; j++ )
            {
               task = d->task;              
               task->n_ready--;
               
               // Place newly ready tasks on waiting queue.
               if ( task->n_ready == 0 )
               {
                  FLASH_Queue_wait_enqueue( task, arg );
               }
               
               // Go to the next dep.
               d = d->next_dep;
            }

            // Free the task.
            FLASH_Task_free( t );
         }
      }      
      
      n_stages++;
      // NOTE(review): the trace prints when verbose is FALSE -- confirm
      // the intended sense of FLASH_Verbose.
      if ( !verbose )
         printf( "%7d", n_stages );
      
      // Move ready tasks from the waiting queue to execution queue.
      for ( i = 0; i < n_threads; i++ )
      {
         // Determine to which queue this thread belongs.
         queue = i / ( n_threads / n_queues );

         // Determine to which cache this thread belongs.
         cache = i / n_cores;

         // Dequeue a task.
         t = FLASH_Queue_wait_dequeue( queue, cache, arg );

         // Save the task for execution.
         exec_array[i] = t;
         
         if ( t != NULL )
         {
            // Save the thread and cache that executes the task.
            t->thread = i;
            t->cache = cache;

            // Increment program counter.
            args->pc++;
         }
      }

      // Execute independent tasks.
      for ( i = 0; i < n_threads; i++ )
      {
         t = exec_array[i];
         FLASH_Queue_update_cache( t, arg );
         FLASH_Queue_exec_task( t );
         
         // Print the task name (or blanks for an idle thread).
         if ( !verbose )
            printf( "%7s", ( t == NULL ? "     " : t->name ) );

         // Free the task if this is the last stage; tasks from earlier
         // stages are freed at the top of the next stage, after their
         // dependents have been released (the loop exits before that
         // cleanup can run for the final stage).
         if ( args->pc == n_tasks && t != NULL )
            FLASH_Task_free( t );
      }
      
      if ( !verbose ) 
         printf( "\n" );
   }

   if ( !verbose )
      printf( "\n" );

#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLA_free( exec_array );
#endif

   return;
}
void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

{
   // Write a single dirty block from this thread's GPU back to main
   // memory. The device-to-host transfer happens outside the critical
   // section; the block is re-located afterwards because the cache may
   // have been reordered while the lock was released.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t       n_blocks    = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool    needs_flush = FALSE;
   FLA_Obj_gpu dirty_obj;
   int         idx;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Search this thread's GPU cache for the block.
   for ( idx = 0; idx < n_blocks; idx++ )
      if ( args->gpu[thread * n_blocks + idx].obj.base == obj.base )
         break;

   if ( idx < n_blocks )
   {
      // Remember the entry so the transfer can run outside the lock.
      dirty_obj = args->gpu[thread * n_blocks + idx];

      // Only valid, dirty blocks need to be written back to the host.
      needs_flush = ( dirty_obj.obj.base != NULL && !dirty_obj.clean );
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Nothing to do if the block is absent or already clean.
   if ( !needs_flush )
      return;

   // Perform the device-to-host copy without holding the lock.
   FLASH_Queue_read_gpu( dirty_obj.obj, dirty_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Re-locate the block; it may have moved while the lock was dropped.
   for ( idx = 0; idx < n_blocks; idx++ )
      if ( args->gpu[thread * n_blocks + idx].obj.base == obj.base )
         break;

   if ( idx < n_blocks )
   {
      // Mark the block clean and clear any pending flush request.
      args->gpu[thread * n_blocks + idx].clean   = TRUE;
      args->gpu[thread * n_blocks + idx].request = FALSE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
void FLASH_Queue_flush_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Flush every dirty block of this thread's GPU cache whose flush has
   // been requested back to main memory. Candidates are gathered under
   // the lock into gpu_log, transferred outside the critical section,
   // then marked clean under the lock again.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int n_transfer = 0;              // number of blocks staged for flushing
   FLA_Obj_gpu gpu_obj;
   
   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   for ( k = 0; k < gpu_n_blocks; k++ )
   {
      // Save the block that might be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];
      
      // Flush the block if it is dirty and requested.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
      {
         // Save the block for data transfer outside the critical section.
         args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
         n_transfer++;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.   
   if ( n_transfer == 0 )
      return;
   
   // Copy each staged block from the GPU back to the host outside the
   // critical section, since the transfers are slow.
   for ( i = 0; i < n_transfer; i++ )
   {
      gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
      FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Update the bits for each block that is flushed.
   for ( i = 0; i < n_transfer; i++ )
   {
      // Re-locate the block; the cache may have been reordered while the
      // lock was released.
      for ( k = 0; k < gpu_n_blocks; k++ )
         if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base == 
              args->gpu[thread * gpu_n_blocks + k].obj.base )
            break;
      
      if ( k < gpu_n_blocks )
      {
         // The block is now clean.
         args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
         args->gpu[thread * gpu_n_blocks + k].request = FALSE;
      }
   }
   
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
void FLASH_Queue_init_tasks ( void *  arg)

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

{
   // One-time preprocessing of the task DAG before execution begins:
   //   - assign each task to a waiting queue according to data affinity,
   //   - compute each task's height in the DAG (used for prioritizing),
   //   - (GPU builds) find the largest block size and widest datatype,
   //   - record the first blocks written (prefetch candidates),
   //   - count the initially ready tasks and enqueue them.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int            i, j, k;
   int            n_tasks    = FLASH_Queue_get_num_tasks();
   int            n_queues   = args->n_queues;
   int            n_prefetch = 0;       // prefetch slots filled so far
   int            n_ready    = 0;       // tasks with no outstanding deps
   int            length     = 0;       // rows of the 2D queue grid
   int            width      = 0;       // columns of the 2D queue grid
   int            height     = 0;       // DAG height of the current task
   int            size       = args->size;
   FLASH_Data_aff data_aff   = FLASH_Queue_get_data_affinity();
   FLASH_Task*    t;
   FLASH_Dep*     d;
   FLA_Obj        obj;

#ifdef FLA_ENABLE_GPU
   // Largest block footprint and widest datatype seen across all task
   // arguments; published at the end for sizing GPU buffers.
   dim_t block_size      = 0;
   FLA_Datatype datatype = FLA_FLOAT;
   dim_t datatype_size   = FLA_Obj_datatype_size( datatype );
#endif

   // Find the 2D factorization of the number of queues.
   if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
   {
      // Choose length x width = n_queues with width as close to
      // sqrt(n_queues) as possible while dividing n_queues evenly.
      int sq_rt = 0;
      while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
      sq_rt--;
      while ( n_queues % sq_rt != 0 ) sq_rt--;
      length = n_queues / sq_rt;
      width  = sq_rt;     
   }

   // Grab the tail of the task queue.
   t = FLASH_Queue_get_tail_task();

   // Walk the task list from tail to head.
   for ( i = n_tasks - 1; i >= 0; i-- )
   {
      // Determine data affinity.
      if ( data_aff == FLASH_QUEUE_AFFINITY_NONE )
      { // No data affinity
         t->queue = 0;
      }
      else 
      {
         // Use the first output block to determine data affinity.
         obj = t->output_arg[0];
         
         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            obj = *FLASH_OBJ_PTR_AT( obj );

         if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
         { // Two-dimensional block cyclic           
            t->queue = ( obj.base->m_index % length ) +
               ( obj.base->n_index % width  ) * length;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC )
         { // One-dimensional row block cyclic
            t->queue = obj.base->m_index % n_queues;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC )
         { // One-dimensional column block cyclic
            t->queue = obj.base->n_index % n_queues;
         }
         else
         { // Round-robin
            t->queue = t->queue % n_queues;
         }
      }

      // Determine the height of each task in the DAG.
      height = 0;
      d = t->dep_arg_head;

      // Take the maximum height of dependent tasks.
      for ( j = 0; j < t->n_dep_args; j++ )
      {
         height = max( height, d->task->height );
         d = d->next_dep;
      }

      t->height = height + 1;

      // Since freeing a task is always a leaf, we want to force it to execute 
      // earlier by giving it a greater height in order to reclaim memory.
      if ( t->func == (void *) FLA_Obj_free_buffer_task )
         t->height += n_tasks;

#ifdef FLA_ENABLE_GPU
      // Scan every argument block to find the largest block size and the
      // widest datatype used anywhere in the DAG.
      for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
      {
         // Find the correct input or output argument.
         if ( j < t->n_output_args )
            obj = t->output_arg[j];
         else
            obj = t->input_arg[j - t->n_output_args];
         
         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m   = FLA_Obj_length( obj );
            dim_t    n   = FLA_Obj_width( obj );
            dim_t    cs  = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
            
            // Check each block in macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );

                  block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

                  // NOTE(review): the datatype check only runs for the
                  // first column of the macroblock (jj == 0) -- verify
                  // that this restriction is intended.
                  if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
                  {
                     datatype      = FLA_Obj_datatype( obj );
                     datatype_size = FLA_Obj_datatype_size( datatype );
                  }
               }
            }
         }
         else // Regular block.
         {
            block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

            // Track the widest datatype seen so far.
            if ( FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
            {
               datatype      = FLA_Obj_datatype( obj );
               datatype_size = FLA_Obj_datatype_size( datatype );
            }
         }
      }
#endif
      
      // Record the first blocks written by the DAG, in order, as
      // prefetch candidates.
      if ( n_prefetch < size )
      {
         for ( j = 0; j < t->n_output_args; j++ )
         {
            obj = t->output_arg[j];

            // Macroblock is used.
            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            {
               dim_t    jj, kk;
               dim_t    m   = FLA_Obj_length( obj );
               dim_t    n   = FLA_Obj_width( obj );
               dim_t    cs  = FLA_Obj_col_stride( obj );
               FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

               // Check each block in macroblock.
               for ( jj = 0; jj < n; jj++ )
               {
                  for ( kk = 0; kk < m; kk++ )
                  {
                     obj = *( buf + jj * cs + kk );

                     k = obj.base->n_write_blocks;

                     // This block is one of the first blocks to be accessed.
                     if ( k < size && k == n_prefetch )
                     {
                        args->prefetch[k] = obj;
                        n_prefetch++;
                     }
                  }
               }
            }
            else // Regular block.
            {
               k = obj.base->n_write_blocks;

               // This block is one of the first blocks to be accessed.
               if ( k < size && k == n_prefetch )
               {
                  args->prefetch[k] = obj;
                  n_prefetch++;
               }
            }
         }
      }

      // Find all ready tasks.
      // NOTE(review): n_ready appears to be pre-decremented by earlier
      // dependence analysis so this sum is zero exactly for tasks with
      // no outstanding dependencies -- confirm against the analysis code.
      t->n_ready += t->n_input_args + t->n_output_args + 
                    t->n_macro_args + t->n_war_args;

      if ( t->n_ready == 0 )
      {
         // Save the number of ready and available tasks.
         n_ready++;
      }

      // Go to the previous task.
      t = t->prev_task;
   }
   
   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();
   
   // Enqueue every initially ready task onto its waiting queue.
   for ( i = 0; i < n_tasks && n_ready > 0; i++ )
   {
      if ( t->n_ready == 0 )
      {
         // Enqueue all the ready and available tasks.
         FLASH_Queue_wait_enqueue( t, arg );

         // Decrement the number of ready tasks left to be enqueued.
         n_ready--;
      }
      
      // Go to the next task.
      t = t->next_task;
   }

#ifdef FLA_ENABLE_GPU
   // Publish the GPU buffer sizing decisions.
   args->block_size = block_size;
   args->datatype   = datatype;
#endif

   return;
}
void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

{
   // Remove a block from the given thread's GPU cache. The freed entry
   // is cleared and moved to the least-recently-used position so its
   // slot is the first to be reused.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t       n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu freed_obj;
   int         idx, m;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Search this thread's GPU cache for the block.
   for ( idx = 0; idx < n_blocks; idx++ )
      if ( args->gpu[thread * n_blocks + idx].obj.base == obj.base )
         break;

   if ( idx < n_blocks )
   {
      // Clear the entry: the block no longer resides on this GPU.
      args->gpu[thread * n_blocks + idx].obj.base = NULL;
      args->gpu[thread * n_blocks + idx].clean    = TRUE;
      args->gpu[thread * n_blocks + idx].request  = FALSE;

      // Keep a copy of the now-empty entry.
      freed_obj = args->gpu[thread * n_blocks + idx];

      // Compact the LRU list over the invalidated slot ...
      for ( m = idx; m < n_blocks - 1; m++ )
         args->gpu[thread * n_blocks + m] = args->gpu[thread * n_blocks + m + 1];

      // ... and park the empty entry in the least-recently-used slot.
      args->gpu[thread * n_blocks + n_blocks - 1] = freed_obj;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
void FLASH_Queue_mark_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   // Mark every unique output block of task t as dirty in the cache of
   // the GPU owned by the thread that executed the task.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int      owner    = t->thread;
   dim_t    n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int      i, j, idx;
   FLA_Bool seen;
   FLA_Obj  blk;

   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      blk = t->output_arg[i];

      // Skip blocks that occur again at a lower index; that occurrence
      // will be (or has been) the one processed.
      seen = FALSE;
      for ( j = 0; j < i; j++ )
      {
         if ( t->output_arg[j].base == blk.base )
         {
            seen = TRUE;
            break;
         }
      }

      if ( seen )
         continue;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[owner]) ); // G ***
#endif

      // Search the owning thread's GPU cache for this block.
      for ( idx = 0; idx < n_blocks; idx++ )
         if ( args->gpu[owner * n_blocks + idx].obj.base == blk.base )
            break;

      if ( idx < n_blocks )
      {
         // The freshly written block is now dirty and unrequested.
         args->gpu[owner * n_blocks + idx].clean   = FALSE;
         args->gpu[owner * n_blocks + idx].request = FALSE;
      }

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[owner]) ); // G ***
#endif
   }

   return;
}
void FLASH_Queue_prefetch ( int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   // Prefetch every valid block recorded in args->prefetch and mirror
   // each prefetch in the software model of the given cache.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;   
   int     n = args->size;
   int     i;
   FLA_Obj blk;

   // Walk the prefetch list backwards so that entry 0 ends up as the
   // most recently used block.
   for ( i = n - 1; i >= 0; i-- )
   {
      blk = args->prefetch[i];

      // Skip empty slots.
      if ( blk.base == NULL )
         continue;

      // Touch the block to pull it into the hardware cache.
      FLASH_Queue_prefetch_block( blk );

      // Record the prefetched block in the cache model.
      args->cache[cache * n + i] = blk;
   }

   return;
}

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

{
   // Touch one element per cache line of the block's buffer so the whole
   // block is pulled into the processor cache before its first real use.
   int          i, inc;
   int          line_size = FLASH_Queue_get_cache_line_size();
   int          elem_size = FLA_Obj_elem_size( obj );
   int          length    = FLA_Obj_length( obj );
   int          width     = FLA_Obj_width( obj );
   FLA_Datatype datatype  = FLA_Obj_datatype( obj );

   // Determine stride to prefetch block into cache: elements per line.
   inc = line_size / elem_size;

   // Guard against elements larger than a cache line, which would make
   // the stride zero and the loops below spin forever.
   if ( inc < 1 )
      inc = 1;

   // Each case reads through a volatile-qualified local so the loads
   // cannot be optimized away. (The previous trick of incrementing a
   // dead, non-volatile local after the loop did not stop an optimizing
   // compiler from deleting the loop entirely, defeating the prefetch.)

   // Switch between the different datatypes.
   switch ( datatype )
   {
      case FLA_FLOAT:
      {
         float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
         volatile float access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_DOUBLE:
      {
         double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
         volatile double access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_COMPLEX:
      {
         scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
         volatile scomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_DOUBLE_COMPLEX:
      {
         dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
         volatile dcomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_INT:
      {
         int *buffer = ( int * ) FLA_INT_PTR( obj );
         volatile int access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      default:
         // This default case should never execute.
         FLA_Check_error_code( FLA_INVALID_DATATYPE );
   }

   return;
}
void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

{
   // Ensure the given block is resident on this thread's GPU and return
   // (through buffer_gpu) the device pointer that holds it. The per-
   // thread GPU block list is kept MRU-first; a miss reuses the LRU
   // entry, flushing it first if it is dirty.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int j, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;       // host-to-device copy needed?
   FLA_Bool evict = FALSE;          // dirty victim must be flushed?
   FLA_Obj_gpu evict_obj;   
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
   
   // Locate the position of the block on GPU. The search deliberately
   // stops before the last slot: on a miss, k indexes the LRU entry,
   // which becomes the victim.
   for ( k = 0; k < gpu_n_blocks - 1; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   // Save the pointer to the data on the GPU.
   buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;

   // Save the victim block.
   evict_obj = args->gpu[thread * gpu_n_blocks + k];

   // The block is not already in the GPU.
   if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
   {
      // Save for data transfer outside of critical section.
      transfer = TRUE;

      // A dirty victim must be written back before its buffer is reused.
      if ( evict_obj.obj.base != NULL && !evict_obj.clean )
      {
         evict = TRUE;
         args->victim[thread] = evict_obj;
      }      

      // Save the block in the data structure.
      args->gpu[thread * gpu_n_blocks + k].obj = obj;

      // Make sure the new block is clean.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }       
   
   // Use the block on the GPU that is a hit or LRU.
   gpu_obj = args->gpu[thread * gpu_n_blocks + k];

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
   
   // Place the block on the cache as the most recently used.
   args->gpu[thread * gpu_n_blocks] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Evict and flush the LRU dirty block outside the critical section.
   if ( evict )
   {
      FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
      
      // The victim record is only needed while the flush is in flight.
      args->victim[thread].obj.base = NULL;
      
#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   // Move the block to the GPU.
   if ( transfer )
      FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

   return;
}
void FLASH_Queue_update_cache ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   // Record all of task t's block accesses in the software cache model:
   // each unique input block as a read, each unique output block as a
   // write. Macroblocks are expanded into their constituent blocks.
   int      in, out, chk;
   FLA_Bool seen;
   FLA_Obj  blk;

   // Nothing to record for an empty task.
   if ( t == NULL )
      return;

   // Pass 1: every unique input block is a read access.
   for ( in = t->n_input_args - 1; in >= 0; in-- )
   {
      blk = t->input_arg[in];

      // A block is a duplicate if it also appears as an output ...
      seen = FALSE;
      for ( chk = 0; chk < t->n_output_args && !seen; chk++ )
      {
         if ( blk.base == t->output_arg[chk].base )
            seen = TRUE;
      }

      // ... or as an earlier input argument.
      for ( chk = 0; chk < in && !seen; chk++ )
      {
         if ( blk.base == t->input_arg[chk].base )
            seen = TRUE;
      }

      if ( seen )
         continue;

      if ( FLA_Obj_elemtype( blk ) == FLA_MATRIX )
      {
         // Macroblock: update the cache model for every inner block.
         dim_t    col, row;
         dim_t    m   = FLA_Obj_length( blk );
         dim_t    n   = FLA_Obj_width( blk );
         dim_t    cs  = FLA_Obj_col_stride( blk );
         FLA_Obj* buf = FLASH_OBJ_PTR_AT( blk );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               FLASH_Queue_update_cache_block( *( buf + col * cs + row ),
                                               t->cache, FALSE, arg );
      }
      else
      {
         // Regular block: single cache update.
         FLASH_Queue_update_cache_block( blk, t->cache, FALSE, arg );
      }
   }

   // Pass 2: every unique output block is a write access.
   for ( out = t->n_output_args - 1; out >= 0; out-- )
   {
      blk = t->output_arg[out];

      // Skip blocks that occur again at a lower output index.
      seen = FALSE;
      for ( chk = 0; chk < out && !seen; chk++ )
      {
         if ( blk.base == t->output_arg[chk].base )
            seen = TRUE;
      }

      if ( seen )
         continue;

      if ( FLA_Obj_elemtype( blk ) == FLA_MATRIX )
      {
         // Macroblock: update the cache model for every inner block.
         dim_t    col, row;
         dim_t    m   = FLA_Obj_length( blk );
         dim_t    n   = FLA_Obj_width( blk );
         dim_t    cs  = FLA_Obj_col_stride( blk );
         FLA_Obj* buf = FLASH_OBJ_PTR_AT( blk );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               FLASH_Queue_update_cache_block( *( buf + col * cs + row ),
                                               t->cache, TRUE, arg );
      }
      else
      {
         // Regular block: single cache update.
         FLASH_Queue_update_cache_block( blk, t->cache, TRUE, arg );
      }
   }

   return;
}
void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

{
   // Record an access to a single block in the software cache model:
   // move the block to the MRU slot of the accessing cache, and, for an
   // output (write) access, invalidate any copies held by other caches
   // (write-invalidate protocol).
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int n_caches = args->n_caches;
   int size     = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***      
#endif

   // Locate the position of the block in the cache. The search stops at
   // size - 1 so that, on a miss, k indexes the LRU slot to replace.
   for ( k = 0; k < size - 1; k++ )
   {
      if ( obj.base == args->cache[cache * size + k].base )
         break;
   }

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->cache[cache * size + j] = args->cache[cache * size + j - 1];

   // Place the block on the cache as the most recently used.
   args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING      
   FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

   // Write invalidate if updating with output block.
   if ( output )
   {
      for ( i = 0; i < n_caches; i++ )
      {
         if ( i != cache )
         {
#ifdef FLA_ENABLE_MULTITHREADING
            FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***      
#endif
            // Locate the position of the block in the other cache.
            for ( k = 0; k < size; k++ )
            {
               if ( obj.base == args->cache[i * size + k].base )
                  break;
            }

            // The block is owned by other thread.
            if ( k < size )
            {
               // Shift all the blocks for the invalidated block.
               for ( j = k; j < size - 1; j++ )
                  args->cache[i * size + j] = args->cache[i * size + j + 1];

               // Invalidate the block by emptying the LRU slot.
               args->cache[i * size + size - 1].base = NULL;
            }
#ifdef FLA_ENABLE_MULTITHREADING      
            FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
         }
      }
   }

   return;
}
void FLASH_Queue_update_gpu ( FLASH_Task *  t,
void **  input_arg,
void **  output_arg,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   // Stage the blocks used by task t onto its thread's GPU, transferring
   // each distinct block exactly once.  Duplicate arguments (same ->base)
   // are first marked NULL and afterwards wired to share the GPU buffer of
   // the argument that was actually transferred.  Output blocks are also
   // invalidated on every other GPU.
   int i, j, k;
   int thread    = t->thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool is_dup;

   // None of the arguments can be macroblocks yet.
   // Complicating factor is copying macroblock to contiguous memory on GPU.

   // Stage the input blocks, skipping any block that repeats an output block
   // or an earlier input block.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      is_dup = FALSE;

      for ( j = 0; !is_dup && j < t->n_output_args; j++ )
         is_dup = ( t->input_arg[i].base == t->output_arg[j].base );

      for ( j = 0; !is_dup && j < i; j++ )
         is_dup = ( t->input_arg[i].base == t->input_arg[j].base );

      if ( is_dup )
         input_arg[i] = NULL;
      else
         FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
   }

   // Stage the output blocks, skipping repeats of earlier output blocks.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      is_dup = FALSE;

      for ( j = 0; !is_dup && j < i; j++ )
         is_dup = ( t->output_arg[i].base == t->output_arg[j].base );

      if ( is_dup )
      {
         output_arg[i] = NULL;
      }
      else
      {
         FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );

         // A freshly written block makes every other GPU's copy stale.
         for ( k = 0; k < n_threads; k++ )
            if ( k != thread )
               FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
      }
   }

   // Resolve duplicate inputs: point them at the buffer of the output or
   // earlier input block they repeat.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      for ( j = 0; input_arg[i] == NULL && j < t->n_output_args; j++ )
         if ( t->input_arg[i].base == t->output_arg[j].base )
            input_arg[i] = output_arg[j];

      for ( j = 0; input_arg[i] == NULL && j < i; j++ )
         if ( t->input_arg[i].base == t->input_arg[j].base )
            input_arg[i] = input_arg[j];
   }

   // Resolve duplicate outputs: point them at their first occurrence.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      for ( j = 0; output_arg[i] == NULL && j < i; j++ )
         if ( t->output_arg[i].base == t->output_arg[j].base )
            output_arg[i] = output_arg[j];
   }

   return;
}
FLASH_Task* FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void *  arg 
)

References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().

{
   // Pop a ready task off waiting queue 'queue' for the calling thread.
   // With cache affinity enabled, prefer a task whose output blocks are
   // already resident in this thread's cache (or GPU, when GPUs are
   // enabled); otherwise dequeue the head task.  Returns NULL when the
   // queue is empty.
   // NOTE(review): no queue lock is taken here -- callers appear to hold
   // run_lock[queue] around this call (see FLASH_Task_update_dependencies);
   // confirm against the other call sites.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   FLASH_Task* t = NULL;
   FLA_Bool enabled = FALSE;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();
#endif

   if ( args->wait_queue[queue].n_tasks > 0 )
   {
      // Dequeue the first task.
      t = args->wait_queue[queue].head;

      if ( args->wait_queue[queue].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[queue].head = NULL;
         args->wait_queue[queue].tail = NULL;        
      }
      else
      {
         // Grab a new task if using cache affinity.
	 if ( FLASH_Queue_get_caching() )
	 {
            // Determine if using GPU or not.
            if ( enabled )
            {
#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***      
#endif
               // Find a task where the task has blocks currently in GPU.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
               
#ifdef FLA_ENABLE_MULTITHREADING      
               FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
#endif
#endif
            }
            else
            {
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***      
#endif
               // Find a task where the task has blocks currently in cache.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
               
#ifdef FLA_ENABLE_MULTITHREADING      
               FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif
            }

            // The chosen task may sit anywhere in the queue, so unlink it
            // from the doubly linked list (not just from the head).
	    // Adjust pointers if the task is head of waiting queue.
	    if ( t->prev_wait == NULL )
	    {
	       args->wait_queue[queue].head = t->next_wait;
	       args->wait_queue[queue].head->prev_wait = NULL;
	    }
	    else
	    {
	       t->prev_wait->next_wait = t->next_wait;
	    }
	    
	    // Adjust pointers if the task is tail of waiting queue.
	    if ( t->next_wait == NULL )
	    {
	       args->wait_queue[queue].tail = t->prev_wait;
	       args->wait_queue[queue].tail->next_wait = NULL;
	    }
	    else
	    {
	       t->next_wait->prev_wait = t->prev_wait;
	    }
	 }
	 else
	 {
	    // No affinity search: simply advance the head of the queue.
	    // Adjust pointers in waiting queue.
	    args->wait_queue[queue].head = t->next_wait;
	    args->wait_queue[queue].head->prev_wait = NULL;
	 }
      }

      // Clear the task's waiting linked list pointers.
      t->prev_wait = NULL;
      t->next_wait = NULL;

      // Decrement number of tasks on waiting queue.
      args->wait_queue[queue].n_tasks--;     
   }

   return t;
}
FLASH_Task* FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

{
   // Scan waiting queue 'queue' for a task with an output block already
   // resident in this thread's cache (or GPU memory when GPUs are enabled).
   // The first such task is marked as a hit and returned; if none is found,
   // the head of the queue is returned instead.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         visited, slot, out;
   int         size    = args->size;
   int         n_tasks = args->wait_queue[queue].n_tasks;
   FLA_Bool    enabled = FALSE;
   FLASH_Task* t;
   FLA_Obj     blk;
   FLA_Obj     mem;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();

   // With GPUs enabled, only the GPU-resident blocks are examined.
   if ( enabled )
      size = FLASH_Queue_get_gpu_num_blocks();
#endif

   t = args->wait_queue[queue].head;

   // Walk every waiting task, comparing its output blocks against each
   // resident block.
   for ( visited = 0; visited < n_tasks; visited++ )
   {
      for ( slot = 0; slot < size; slot++ )
      {
         // Default to an empty slot in case neither branch fills it in.
         mem.base = NULL;

         if ( enabled )
         {
#ifdef FLA_ENABLE_GPU
            mem = args->gpu[cache * size + slot].obj;
#endif
         }
         else
         {
            mem = args->cache[cache * size + slot];
         }

         for ( out = 0; out < t->n_output_args; out++ )
         {
            blk = t->output_arg[out];

            // For a macroblock, compare against the block it points at.
            if ( FLA_Obj_elemtype( blk ) == FLA_MATRIX )
               blk = *FLASH_OBJ_PTR_AT( blk );

            // Resident hit: this task reuses a cached block, so take it.
            if ( mem.base == blk.base )
            {
               t->hit = TRUE;
               return t;
            }
         }
      }
      t = t->next_wait;
   }

   // No waiting task touches a resident block; hand back the queue head.
   return args->wait_queue[queue].head;
}
void FLASH_Queue_wait_enqueue ( FLASH_Task *  t,
void *  arg 
)

References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().

{  
   // Insert ready task t into its waiting queue (t->queue).  When sorting
   // is enabled, the queue is kept ordered by non-increasing task height
   // via a tail-to-head insertion scan; otherwise t is appended at the
   // tail.
   // NOTE(review): no lock is taken here -- callers appear to hold
   // run_lock[queue] around this call (see FLASH_Task_update_binding);
   // confirm against the other call sites.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int queue = t->queue;

   if ( args->wait_queue[queue].n_tasks == 0 )
   {
      // Empty queue: t becomes both head and tail.
      args->wait_queue[queue].head = t;
      args->wait_queue[queue].tail = t;
   }
   else
   {
      // Start the insertion scan from the tail.
      t->prev_wait = args->wait_queue[queue].tail;

      // Insertion sort of tasks in waiting queue.
      if ( FLASH_Queue_get_sorting() )
      {
         // Walk toward the head until a task at least as tall is found;
         // t will be inserted immediately after it.
         while ( t->prev_wait != NULL )
         {
            if ( t->prev_wait->height >= t->height )
               break;
            
            t->next_wait = t->prev_wait;
            t->prev_wait = t->prev_wait->prev_wait;
         }         
      }

      // Checking if the task is the head of the waiting queue.      
      if ( t->prev_wait == NULL )
         args->wait_queue[queue].head = t;
      else
         t->prev_wait->next_wait = t;

      // Checking if the task is the tail of the waiting queue.
      if ( t->next_wait == NULL )
         args->wait_queue[queue].tail = t;
      else
         t->next_wait->prev_wait = t;
   }

   // Increment number of tasks on waiting queue.
   args->wait_queue[queue].n_tasks++;

   return;
}
FLASH_Task* FLASH_Queue_work_stealing ( int  queue,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Steal the tail task from a randomly chosen victim queue so that an
   // idle thread can keep working when its own queue is empty.  Returns the
   // stolen task (now bound to this thread's queue), or NULL when there is
   // only one queue or the chosen victim queue was empty.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         q;
   int         n_queues = args->n_queues;
   FLASH_Task* t = NULL;

   // Do not perform work stealing if there is only one queue.
   if ( n_queues == 1 )
      return t;

   // Find a random queue not equal to the current queue.
   do
   {
#ifdef FLA_ENABLE_WINDOWS_BUILD
      {
         // BUGFIX: rand_s() writes through an unsigned int*, so draw into
         // an unsigned temporary instead of passing &q (an int*).
         unsigned int r;
         rand_s( &r );
         q = ( int )( r % ( unsigned int ) n_queues );
      }
#else
#ifdef FLA_ENABLE_TIDSP
      q = rand() % n_queues;
#else
      q = lrand48() % n_queues;
#endif
#endif
   }
   while ( q == queue );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
#endif

   // If there are tasks that this thread can steal.
   if ( args->wait_queue[q].n_tasks > 0 )
   {
      // Dequeue the last task.
      t = args->wait_queue[q].tail;

      if ( args->wait_queue[q].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[q].head = NULL;
         args->wait_queue[q].tail = NULL;
      }
      else
      {
         // Adjust pointers in waiting queue.
         args->wait_queue[q].tail = t->prev_wait;
         args->wait_queue[q].tail->next_wait = NULL;
      }

      // Reset waiting queue data about the stolen task: it now belongs to
      // this thread's queue.
      t->queue = queue;
      t->prev_wait = NULL;
      t->next_wait = NULL;
      
      args->wait_queue[q].n_tasks--;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->run_lock[q]) ); // R ***
#endif

   return t;
}
void FLASH_Task_free_parallel ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Free task t and all of its per-task resources after execution, and
   // clear the dependence bookkeeping (write_task pointer and read-task
   // list) on every block the task touched.  The war_lock protecting a
   // block's read-task list is selected by hashing the block's
   // n_read_blocks over the number of threads.
   // NOTE(review): unlike most functions in this file, the FLA_Lock_
   // acquire/release calls here are not guarded by
   // FLA_ENABLE_MULTITHREADING -- presumably this path is only compiled
   // or reached in multithreaded builds; confirm.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;   
   int        i, j, k;
   int        thread;
   int        n_threads = FLASH_Queue_get_num_threads();
   FLASH_Dep* d;
   FLASH_Dep* next_dep;
   FLA_Obj    obj;

   // Clearing the last write task in each output block.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];
      
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
         
         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               ( buf + jj * cs + kk )->base->write_task = NULL;
      }
      else // Clear regular block.
      {
         obj.base->write_task = NULL;
      }
   }
   
   // Cleaning the last read tasks in each input block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
         {
            for ( kk = 0; kk < m; kk++ )
            {
               obj = *( buf + jj * cs + kk );

               // Pick the lock shard that guards this block's read list.
               thread = obj.base->n_read_blocks % n_threads;

               FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

               // Detach the read-task list under the lock ...
               k = obj.base->n_read_tasks;
               d = obj.base->read_task_head;

               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;

               FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

               // ... then free its nodes outside the critical section.
               for ( j = 0; j < k; j++ )
               {
                  next_dep = d->next_dep;
                  FLA_free( d );
                  d = next_dep;
               }
            }
         }
      }
      else // Regular block.
      {
         // Pick the lock shard that guards this block's read list.
         thread = obj.base->n_read_blocks % n_threads;

         FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

         // Detach the read-task list under the lock ...
         k = obj.base->n_read_tasks;
         d = obj.base->read_task_head;

         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;

         FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

         // ... then free its nodes outside the critical section.
         for ( j = 0; j < k; j++ )
         {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
         }
      }
   }
   
   // Free the dep_arg field of t.
   d = t->dep_arg_head;

   for ( i = 0; i < t->n_dep_args; i++ )
   {
      next_dep = d->next_dep;
      FLA_free( d );
      d = next_dep;
   }

   // Free the int_arg field of t.
   FLA_free( t->int_arg );
   
   // Free the fla_arg field of t.
   FLA_free( t->fla_arg );

   // Free the input_arg field of t.
   FLA_free( t->input_arg );

   // Free the output_arg field of t.
   FLA_free( t->output_arg );

   // Finally, free the struct itself.
   FLA_free( t );

   return;
}
FLASH_Task* FLASH_Task_update_binding ( FLASH_Task *  t,
FLASH_Task *  r,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

{
   // Decide which ready task stays bound to the calling thread: keep the
   // previously binded task r, or replace it with the newly ready task t.
   // Whichever task is not kept is placed back onto its waiting queue
   // (under that queue's run lock).  Returns the task that remains bound.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int q;

   if ( r == NULL )
   {
      // Nothing is bound yet, so bind the new task unconditionally.
      r = t;
      r->hit = TRUE;
      return r;
   }

   if ( r->hit && !( FLASH_Queue_get_sorting() && r->height < t->height ) )
   {
      // Keep the current binding; the new task goes onto its waiting queue.
      q = t->queue;

      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      FLASH_Queue_wait_enqueue( t, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***
   }
   else
   {
      // Prefer the new task; return the old binding to its waiting queue.
      q = r->queue;
      r->hit = FALSE;

      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      FLASH_Queue_wait_enqueue( r, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***

      // Bind the new ready task.
      r = t;
      r->hit = TRUE;
   }

   return r;
}

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().

{
   // Run after task t completes: decrement the ready counter of every task
   // that depends on t, enqueue the tasks that become ready, and -- when
   // caching is enabled -- choose a task to bind to this thread.  Returns
   // the binded task (or NULL when caching is off or nothing was dequeued).
   // NOTE(review): the FLA_Lock_acquire/release calls here are not guarded
   // by FLA_ENABLE_MULTITHREADING, unlike most functions in this file --
   // presumably this path only runs in multithreaded builds; confirm.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         i;
   int         q = t->queue;
   int         queue;
   int         thread;
   int         n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool    caching   = FLASH_Queue_get_caching();
   FLA_Bool    stealing  = FLASH_Queue_get_work_stealing();
   FLA_Bool    available;
   FLASH_Task* task;
   FLASH_Task* r = NULL;
   FLASH_Dep*  d = t->dep_arg_head;
   
   // Dequeue task to bind to thread if caching is enabled.
   if ( caching )
   {
      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      // Obtain task to execute.
      r = FLASH_Queue_wait_dequeue( q, t->cache, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***
   }

   // Check each dependent task.
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      if ( stealing )
      {
         // Place all dependent tasks onto same queue as predecessor task.
         d->task->queue = q;
      }

      task   = d->task;
      queue  = task->queue;
      // The dep lock shard is chosen by the task's creation order.
      thread = task->order % n_threads;

      FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***
      
      // One fewer unmet dependence; ready when the count reaches zero.
      task->n_ready--;
      available = ( task->n_ready == 0 );
      
      FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***

      // Place newly ready tasks on waiting queue.      
      if ( available )
      {
         // If caching is enabled and the task belongs to this thread's queue.
         if ( caching && q == queue )
         {
            // Determine if there is a new binded task.
            r = FLASH_Task_update_binding( task, r, arg );
         }
         else
         {
            FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
            
            FLASH_Queue_wait_enqueue( task, arg );
            
            FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
         }
      }
      
      // Go to the next dep.
      d = d->next_dep;
   }

   return r;
}
int RCCE_acquire_lock ( int  )
int RCCE_release_lock ( int  )
int RCCE_ue ( void  )
double RCCE_wtime ( void  )

Referenced by FLASH_Queue_exec().

void Synch_all ( )

Referenced by FLASH_Queue_exec().