libflame 12600
typedef struct FLA_Obj_gpu_struct FLA_Obj_gpu;
typedef struct FLASH_Queue_variables FLASH_Queue_vars;
FLA_Bool FLASH_Queue_check_block_gpu( FLA_Obj obj, int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_check_gpu().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int k;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Bool r_val = TRUE;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Locate the position of the block on the GPU.
  for ( k = 0; k < gpu_n_blocks; k++ )
    if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
      break;

  if ( k < gpu_n_blocks )
  {
    // Request this block if it is dirty.
    if ( !args->gpu[thread * gpu_n_blocks + k].clean )
    {
      args->gpu[thread * gpu_n_blocks + k].request = TRUE;
      r_val = FALSE;
    }
  }

  // Check the victim block.
  if ( obj.base == args->victim[thread].obj.base )
    r_val = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return r_val;
}
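Note: throughout these routines the per-GPU software cache is one flat array of FLA_Obj_gpu entries, partitioned by thread; entry k of thread t's cache lives at index t * gpu_n_blocks + k. A minimal sketch of that indexing convention (the helper name is hypothetical, not part of libflame):

  // Hypothetical helper illustrating the flat per-thread cache layout used
  // above: entry k of thread t's GPU cache is gpu[t * gpu_n_blocks + k].
  static FLA_Obj_gpu* gpu_cache_entry( FLA_Obj_gpu* gpu, dim_t gpu_n_blocks,
                                       int thread, dim_t k )
  {
    return &gpu[ thread * gpu_n_blocks + k ];
  }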
FLA_Bool FLASH_Queue_check_gpu( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  int i, j, k;
  int thread = t->thread;
  int n_input_args = t->n_input_args;
  int n_output_args = t->n_output_args;
  int n_threads = FLASH_Queue_get_num_threads();
  FLA_Bool r_val = TRUE;
  FLA_Bool t_val;
  FLA_Bool duplicate;
  FLA_Obj obj;

  // Check the input and output arguments on the GPUs.
  for ( i = 0; i < n_input_args + n_output_args; i++ )
  {
    // Check for duplicate blocks.
    duplicate = FALSE;

    // Find the correct input or output argument.
    if ( i < n_input_args )
    {
      obj = t->input_arg[i];

      for ( j = 0; j < n_output_args && !duplicate; j++ )
      {
        if ( obj.base == t->output_arg[j].base )
          duplicate = TRUE;
      }

      for ( j = 0; j < i && !duplicate; j++ )
      {
        if ( obj.base == t->input_arg[j].base )
          duplicate = TRUE;
      }
    }
    else
    {
      obj = t->output_arg[i - n_input_args];

      for ( j = 0; j < i - n_input_args && !duplicate; j++ )
      {
        if ( obj.base == t->output_arg[j].base )
          duplicate = TRUE;
      }
    }

    // If the block has not been processed before.
    if ( !duplicate )
    {
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
        dim_t jj, kk;
        dim_t m  = FLA_Obj_length( obj );
        dim_t n  = FLA_Obj_width( obj );
        dim_t cs = FLA_Obj_col_stride( obj );
        FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

        // Check each block in the macroblock.
        for ( jj = 0; jj < n; jj++ )
        {
          for ( kk = 0; kk < m; kk++ )
          {
            obj = *( buf + jj * cs + kk );

            t_val = TRUE;

            // Check to see if the block is dirty on another GPU.
            for ( k = 0; k < n_threads && t_val; k++ )
              if ( k != thread )
                t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

            r_val = r_val && t_val;
          }
        }
      }
      else
      {
        t_val = TRUE;

        // Check to see if the block is dirty on another GPU.
        for ( k = 0; k < n_threads && t_val; k++ )
          if ( k != thread )
            t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

        r_val = r_val && t_val;
      }
    }
  }

  return r_val;
}
void FLASH_Queue_create_gpu( int thread, void* arg )
References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), and FLASH_Queue_variables::gpu.
Referenced by FLASH_Queue_exec_parallel_function().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  dim_t block_size = args->block_size;
  FLA_Datatype datatype = args->datatype;

  // Exit if not using GPU.
  if ( !FLASH_Queue_get_enabled_gpu() )
    return;

  // Bind thread to GPU.
  FLASH_Queue_bind_gpu( thread );

  // Allocate the memory on the GPU for all the blocks a priori.
  for ( i = 0; i < gpu_n_blocks; i++ )
    FLASH_Queue_alloc_gpu( block_size, datatype,
                           &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );

  return;
}
void FLASH_Queue_destroy_gpu( int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and FLA_Obj_gpu_struct::obj.
Referenced by FLASH_Queue_exec_parallel_function().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Obj_gpu gpu_obj;

  // Exit if not using GPU.
  if ( !FLASH_Queue_get_enabled_gpu() )
    return;

  // Examine every block left on the GPU.
  for ( i = 0; i < gpu_n_blocks; i++ )
  {
    gpu_obj = args->gpu[thread * gpu_n_blocks + i];

    // Flush the blocks that are dirty.
    if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
      FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

    // Free the memory on the GPU for all the blocks.
    FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
  }

  return;
}
void FLASH_Queue_exec( void )
References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.
Referenced by FLASH_Queue_end().
{
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_threads = FLASH_Queue_get_num_threads();
  int n_queues;
  int n_caches;
  int size;
  int i;
  dim_t block_size = FLASH_Queue_get_block_size();
  double dtime;
  FLA_Lock* run_lock;
  FLA_Lock* dep_lock;
  FLA_Lock* war_lock;
  FLA_Lock* cac_lock;
  FLA_Obj* cache;
  FLA_Obj* prefetch;
  FLASH_Queue* wait_queue;
#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock* gpu_lock;
#endif
  FLA_Obj_gpu* gpu;
  FLA_Obj_gpu* victim;
  FLA_Obj_gpu* gpu_log;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
#endif

  // All the necessary variables for the SuperMatrix mechanism.
  FLASH_Queue_vars args;

  // If the queue is empty, return early.
  if ( n_tasks == 0 )
    return;

#ifndef FLA_ENABLE_MULTITHREADING
  // Turn off work stealing in simulation mode.
  FLASH_Queue_set_work_stealing( FALSE );
#endif

  // Query the number of user set threads per queue.
  n_queues = FLASH_Queue_get_cores_per_queue();

  // Default user setting for number of threads.
  if ( n_queues <= 0 )
  {
    // Do not use data affinity or work stealing when caching is enabled.
    if ( FLASH_Queue_get_caching() )
    {
      FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
      FLASH_Queue_set_work_stealing( FALSE );
    }

    // Do not use work stealing when data affinity is enabled.
    if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
    {
      FLASH_Queue_set_work_stealing( FALSE );
    }

    // Allocate different arrays if using data affinity.
    n_queues = ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
                 !FLASH_Queue_get_work_stealing() ? 1 : n_threads );
  }
  else
  {
    // Set the number of queues.
    n_queues = n_threads / n_queues;

    // Must use at least one queue.
    if ( n_queues == 0 )
      n_queues = 1;

    if ( n_queues == 1 )
    {
      // Turn off all multiple queue implementations.
      FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
      FLASH_Queue_set_work_stealing( FALSE );
    }
    else
    {
      // Use 2D data affinity for multiple queues if nothing is set.
      if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
           !FLASH_Queue_get_work_stealing() )
      {
        FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC );
      }
    }
  }

  // Determine the number of caches.
  n_caches = n_threads / FLASH_Queue_get_cores_per_cache();

  args.n_queues = n_queues;
  args.n_caches = n_caches;

#ifdef FLA_ENABLE_MULTITHREADING
  // Allocate memory for arrays of locks.
  run_lock = ( FLA_Lock* ) FLA_malloc( n_queues * sizeof( FLA_Lock ) );
  dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
  war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
  cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches * sizeof( FLA_Lock ) );

  args.run_lock = run_lock;
  args.dep_lock = dep_lock;
  args.war_lock = war_lock;
  args.cac_lock = cac_lock;

  // Initialize the all lock.
  FLA_Lock_init( &(args.all_lock) );

  // Initialize the run lock for each queue.
  for ( i = 0; i < n_queues; i++ )
  {
    FLA_Lock_init( &(args.run_lock[i]) );
  }

  // Initialize the dep and war locks for each thread.
  for ( i = 0; i < n_threads; i++ )
  {
    FLA_Lock_init( &(args.dep_lock[i]) );
    FLA_Lock_init( &(args.war_lock[i]) );
  }

  // Initialize the cac locks for each cache.
  for ( i = 0; i < n_caches; i++ )
  {
    FLA_Lock_init( &(args.cac_lock[i]) );
  }
#endif

  // The number of blocks that can fit into the cache on each thread.
  if ( block_size == 0 )
    size = MIN_CACHE_BLOCKS;
  else
    size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS );
  args.size = size;

  // Allocate memory for cache, prefetch buffer, and waiting queue.
  cache = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
  prefetch = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
  wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ) );

  args.cache = cache;
  args.prefetch = prefetch;
  args.wait_queue = wait_queue;

  // Initialize cache, prefetch buffer, and waiting queue.
  for ( i = 0; i < size * n_caches; i++ )
    args.cache[i].base = NULL;

  for ( i = 0; i < size; i++ )
    args.prefetch[i].base = NULL;

  for ( i = 0; i < n_queues; i++ )
  {
    args.wait_queue[i].n_tasks = 0;
    args.wait_queue[i].head = NULL;
    args.wait_queue[i].tail = NULL;
  }

  // Initialize the aggregate task counter.
  args.pc = 0;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
  // Allocate and initialize the gpu locks.
  gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
  args.gpu_lock = gpu_lock;

  for ( i = 0; i < n_threads; i++ )
    FLA_Lock_init( &(args.gpu_lock[i]) );
#endif
  // Allocate and initialize the GPU software cache.
  gpu = ( FLA_Obj_gpu* )
        FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
  args.gpu = gpu;

  for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
  {
    args.gpu[i].obj.base   = NULL;
    args.gpu[i].buffer_gpu = NULL;
    args.gpu[i].clean      = TRUE;
    args.gpu[i].request    = FALSE;
  }

  victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
  args.victim = victim;

  for ( i = 0; i < n_threads; i++ )
    args.victim[i].obj.base = NULL;

  gpu_log = ( FLA_Obj_gpu* )
            FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
  args.gpu_log = gpu_log;
#endif

  // Initialize tasks with critical information.
  FLASH_Queue_init_tasks( ( void* ) &args );

  // Display verbose output before freeing all tasks.
  if ( FLASH_Queue_get_verbose_output() )
    FLASH_Queue_verbose_output();

  // Start timing the parallel execution.
  dtime = FLA_Clock();

#ifdef FLA_ENABLE_MULTITHREADING
  // Parallel execution!
  FLASH_Queue_exec_parallel( ( void* ) &args );
#else
  // Simulation!
  FLASH_Queue_exec_simulation( ( void* ) &args );
#endif

  // End timing the parallel execution.
  dtime = FLA_Clock() - dtime;
  FLASH_Queue_set_parallel_time( dtime );

#ifdef FLA_ENABLE_MULTITHREADING
  // Destroy the locks.
  FLA_Lock_destroy( &(args.all_lock) );

  for ( i = 0; i < n_queues; i++ )
  {
    FLA_Lock_destroy( &(args.run_lock[i]) );
  }

  for ( i = 0; i < n_threads; i++ )
  {
    FLA_Lock_destroy( &(args.dep_lock[i]) );
    FLA_Lock_destroy( &(args.war_lock[i]) );
  }

  for ( i = 0; i < n_caches; i++ )
  {
    FLA_Lock_destroy( &(args.cac_lock[i]) );
  }

  // Deallocate memory.
  FLA_free( run_lock );
  FLA_free( dep_lock );
  FLA_free( war_lock );
  FLA_free( cac_lock );
#endif

  FLA_free( cache );
  FLA_free( prefetch );
  FLA_free( wait_queue );

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
  for ( i = 0; i < n_threads; i++ )
    FLA_Lock_destroy( &(args.gpu_lock[i]) );
  FLA_free( gpu_lock );
#endif
  FLA_free( gpu );
  FLA_free( victim );
  FLA_free( gpu_log );
#endif

  // Reset values for the next call to FLASH_Queue_exec().
  FLASH_Queue_reset();

  return;
}
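For context, FLASH_Queue_exec() is not called directly by user code; it runs when the outermost FLASH_Queue_end() closes the queue. A minimal usage sketch, assuming the usual SuperMatrix front-end entry points (FLASH_Queue_enable(), FLASH_Queue_set_num_threads(), and explicit begin/end nesting to fuse operations into a single task DAG):

  // Sketch: tasks enqueued between begin/end execute via FLASH_Queue_exec()
  // when the outermost FLASH_Queue_end() is reached.
  FLASH_Queue_enable();
  FLASH_Queue_set_num_threads( 4 );

  FLASH_Queue_begin();
  FLASH_Chol( FLA_LOWER_TRIANGULAR, A );   // enqueues tasks on hierarchical A
  FLASH_Queue_end();                       // triggers FLASH_Queue_exec()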
FLA_Bool FLASH_Queue_exec_gpu( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_parallel_function().
{
  void** input_arg;
  void** output_arg;

  if ( t == NULL )
    return TRUE;

  // If not using the GPU, then execute on the CPU.
  if ( !FLASH_Queue_get_enabled_gpu() )
  {
    FLASH_Queue_exec_task( t );
    return TRUE;
  }

  // Check if all the operands are ready and up to date.
  if ( !FLASH_Queue_check_gpu( t, arg ) )
  {
    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
    int queue = t->queue;
    t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif

    // Reenqueue the task if the blocks are not all flushed.
    FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
#endif

    return FALSE;
  }

  // If the GPU is enabled, but the task is not supported for GPU execution.
  if ( !t->enabled_gpu )
  {
    int i, j, k;
    int thread = t->thread;
    int n_input_args = t->n_input_args;
    int n_output_args = t->n_output_args;
    int n_threads = FLASH_Queue_get_num_threads();
    FLA_Bool duplicate;
    FLA_Obj obj;

    // Check the blocks on each GPU.
    for ( k = 0; k < n_threads; k++ )
    {
      // Check the input and output arguments on the GPUs.
      for ( i = 0; i < n_input_args + n_output_args; i++ )
      {
        // Check for duplicate blocks.
        duplicate = FALSE;

        // Find the correct input or output argument.
        if ( i < n_input_args )
        {
          obj = t->input_arg[i];

          for ( j = 0; j < n_output_args && !duplicate; j++ )
          {
            if ( obj.base == t->output_arg[j].base )
              duplicate = TRUE;
          }

          for ( j = 0; j < i && !duplicate; j++ )
          {
            if ( obj.base == t->input_arg[j].base )
              duplicate = TRUE;
          }
        }
        else
        {
          obj = t->output_arg[i - n_input_args];

          for ( j = 0; j < i - n_input_args && !duplicate; j++ )
          {
            if ( obj.base == t->output_arg[j].base )
              duplicate = TRUE;
          }
        }

        // If the block has not been processed before.
        if ( !duplicate )
        {
          // Macroblock is used.
          if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
          {
            dim_t jj, kk;
            dim_t m  = FLA_Obj_length( obj );
            dim_t n  = FLA_Obj_width( obj );
            dim_t cs = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

            // Flush or invalidate each block in the macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
              for ( kk = 0; kk < m; kk++ )
              {
                obj = *( buf + jj * cs + kk );

                // Flush the block to main memory if it is on the GPU.
                if ( k == thread )
                  FLASH_Queue_flush_block_gpu( obj, k, arg );

                // Invalidate output block on all GPUs.
                if ( i >= n_input_args )
                  FLASH_Queue_invalidate_block_gpu( obj, k, arg );
              }
            }
          }
          else
          {
            // Flush the block to main memory if it is on the GPU.
            if ( k == thread )
              FLASH_Queue_flush_block_gpu( obj, k, arg );

            // Invalidate output block on all GPUs.
            if ( i >= n_input_args )
              FLASH_Queue_invalidate_block_gpu( obj, k, arg );
          }
        }
      }
    }

    // Execute the task on the CPU instead of the GPU.
    FLASH_Queue_exec_task( t );

    return TRUE;
  }

  // Gather the pointers for the data on the GPU.
  input_arg  = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
  output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

  // Bring all the blocks to the GPU.
  FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );

  // Execute the task on the GPU.
  FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );

  // Mark all the output blocks as dirty.
  FLASH_Queue_mark_gpu( t, arg );

  // Free memory.
  FLA_free( input_arg );
  FLA_free( output_arg );

  return TRUE;
}
void FLASH_Queue_exec_parallel( void* arg )
References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_get_num_threads().
Referenced by FLASH_Queue_exec().
{
  int i;
  int n_threads = FLASH_Queue_get_num_threads();
  void* (*thread_entry_point)( void* );

  // Allocate the thread structures array. Here, an array of FLASH_Thread
  // structures of length n_threads is allocated and the fields of each
  // structure set to appropriate values.
  FLASH_Thread* thread = ( FLASH_Thread* )
                         FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

  // Initialize the thread structures array.
  for ( i = 0; i < n_threads; i++ )
  {
    // Save the thread's identifier.
    thread[i].id = i;

    // Save the pointer to the necessary variables with the thread.
    thread[i].args = arg;

    // The pthread object, if it was even compiled into the FLASH_Thread
    // structure, will be initialized by the pthread implementation when we
    // call pthread_create() and does not need to be touched at this time.
  }

  // Determine which function to send threads to.
  thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

  // An OpenMP parallel for region spawns n_threads threads. Each thread
  // executes the work function with a different FLASH_Thread argument.
  // An implicit synchronization point exists at the end of the curly
  // brace scope.
  #pragma omp parallel for \
          private( i ) \
          shared( thread, n_threads, thread_entry_point ) \
          schedule( static, 1 ) \
          num_threads( n_threads )
  for ( i = 0; i < n_threads; ++i )
  {
    thread_entry_point( ( void* ) &thread[i] );
  }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

  // Create each POSIX thread needed in addition to the main thread.
  for ( i = 1; i < n_threads; i++ )
  {
    int pthread_e_val;

    // Create thread i with default attributes.
    pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                    NULL,
                                    thread_entry_point,
                                    ( void* ) &thread[i] );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
    FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
    FLA_Check_error_code( e_val );
#endif
  }

  // The main thread is assigned the role of thread 0. Here we manually
  // execute it as a worker thread.
  thread_entry_point( ( void* ) &thread[0] );

  // Wait for non-main threads to finish.
  for ( i = 1; i < n_threads; i++ )
  {
    // These two variables are declared local to this for loop since this
    // is the only place they are needed, and since they would show up as
    // unused variables if FLA_MULTITHREADING_MODEL == FLA_OPENMP.
    // Strangely, the Intel compiler produces code that results in an
    // "unaligned access" runtime message if thread_status is declared as
    // an int. Declaring it as a long or void* appears to force the
    // compiler (not surprisingly) into aligning it to an 8-byte boundary.
    int pthread_e_val;
    void* thread_status;

    // Wait for thread i to invoke its respective pthread_exit().
    // The return value passed to pthread_exit() is provided to us
    // via status, if one was given.
    pthread_e_val = pthread_join( thread[i].pthread_obj,
                                  ( void** ) &thread_status );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
    FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
    FLA_Check_error_code( e_val );
#endif
  }

#endif

  FLA_free( thread );

  return;
}
void* FLASH_Queue_exec_parallel_function( void* arg )
References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().
{
  FLASH_Queue_vars* args;
  int i;
  int queue;
  int cache;
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_threads = FLASH_Queue_get_num_threads();
  int n_cores = FLASH_Queue_get_cores_per_cache();
  FLA_Bool caching = FLASH_Queue_get_caching();
  FLA_Bool stealing = FLASH_Queue_get_work_stealing();
  FLA_Bool committed = TRUE;
  FLA_Bool condition = TRUE;
  FLA_Bool enabled = FALSE;
  FLA_Bool available;
  FLASH_Task* t = NULL;
  FLASH_Task* r = NULL;
  FLASH_Thread* me;
  //cpu_set_t cpu_set;

  // Interpret the thread argument as what it really is: a pointer to a
  // FLASH_Thread structure.
  me = ( FLASH_Thread* ) arg;

  // Extract the variables from the current thread.
  args = ( FLASH_Queue_vars* ) me->args;

  // Figure out the id of the current thread.
  i = me->id;

  // Set the CPU affinity; we want the current thread i to run only on CPU i.
  //CPU_ZERO( &cpu_set );
  //CPU_SET( i, &cpu_set );
  //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );

  // Determine to which queue this thread belongs.
  queue = i / ( n_threads / args->n_queues );

  // Determine to which cache this thread belongs.
  cache = i / n_cores;

#ifdef FLA_ENABLE_GPU
  // Create memory on the GPU.
  FLASH_Queue_create_gpu( i, ( void* ) args );

  // Save whether GPUs are enabled.
  enabled = FLASH_Queue_get_enabled_gpu();

  // Only use each GPU as its own cache when GPUs are enabled.
  if ( enabled )
    cache = i;
#endif

  // Prefetch blocks into the cache before execution.
  if ( caching && !enabled && i % n_cores == 0 )
    FLASH_Queue_prefetch( cache, ( void* ) args );

  // Loop until all the tasks have committed.
  while ( condition )
  {
#ifdef FLA_ENABLE_GPU
    // Check to see if any blocks on the GPU need to be flushed.
    FLASH_Queue_flush_gpu( i, ( void* ) args );
#endif

    // Dequeue a task if one has not already been bound to this thread.
    if ( r == NULL )
    {
      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

      // Obtain a task to execute.
      t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );

      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
    }
    else
    {
      // Obtain the bound task.
      t = r;
      r = NULL;
    }

    // Whether a task was dequeued from the waiting queue.
    available = ( t != NULL );

    if ( available )
    {
      // Save the thread and cache that execute the task.
      t->thread = i;
      t->cache = cache;

      if ( caching && !enabled )
      {
        // Update the current state of the cache.
        FLASH_Queue_update_cache( t, ( void* ) args );
      }

#ifdef FLA_ENABLE_GPU
      // Execute the task on the GPU.
      committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
#else
      // Execute the task.
      FLASH_Queue_exec_task( t );
#endif

      // Only proceed if the task actually executed.
      if ( committed )
      {
        // Update task dependencies.
        r = FLASH_Task_update_dependencies( t, ( void* ) args );

        // Free the task once it executes in parallel.
        FLASH_Task_free_parallel( t, ( void* ) args );
      }
    }
    else
    {
      if ( stealing )
      {
        // Perform work stealing if there are no tasks to dequeue.
        r = FLASH_Queue_work_stealing( queue, ( void* ) args );
      }
    }

    FLA_Lock_acquire( &(args->all_lock) ); // A ***

    // Increment the program counter.
    if ( available && committed )
      args->pc++;

    // Terminate the loop once all tasks have committed.
    if ( args->pc >= n_tasks )
      condition = FALSE;

    FLA_Lock_release( &(args->all_lock) ); // A ***
  }

#ifdef FLA_ENABLE_GPU
  // Destroy the GPU cache, flushing its contents back to main memory.
  FLASH_Queue_destroy_gpu( i, ( void* ) args );
#endif

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  // If this is a non-main thread, then exit with a zero (normal) error code.
  // The main thread cannot call pthread_exit() because that function never
  // returns; the main thread must proceed so it can oversee the joining of
  // the exited non-main pthreads.
  if ( i != 0 )
    pthread_exit( ( void* ) NULL );
#endif

  return ( void* ) NULL;
}
void FLASH_Queue_exec_simulation( void* arg )
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, FLASH_Dep_s::task, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j;
  int queue;
  int cache;
  int n_stages = 0;
  int n_queues = args->n_queues;
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_threads = FLASH_Queue_get_num_threads();
  int n_cores = FLASH_Queue_get_cores_per_cache();
  FLASH_Verbose verbose = FLASH_Queue_get_verbose_output();
  FLASH_Task* task;
  FLASH_Task* t;
  FLASH_Dep* d;

  // An array to hold the tasks to be executed during the simulation.
#ifdef FLA_ENABLE_WINDOWS_BUILD
  FLASH_Task** exec_array = ( FLASH_Task** )
                            FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
  FLASH_Task* exec_array[n_threads];
#endif

  for ( i = 0; i < n_threads; i++ )
  {
    // Initialize all of exec_array to NULL.
    exec_array[i] = NULL;

    // Prefetch blocks into the cache before execution.
    if ( i % n_cores == 0 )
      FLASH_Queue_prefetch( i, arg );
  }

  // Loop until all the tasks have committed.
  while ( args->pc < n_tasks )
  {
    for ( i = 0; i < n_threads; i++ )
    {
      // Update the waiting queue with ready tasks.
      t = exec_array[i];

      if ( t != NULL )
      {
        // Check each dependent task.
        d = t->dep_arg_head;

        for ( j = 0; j < t->n_dep_args; j++ )
        {
          task = d->task;
          task->n_ready--;

          // Place newly ready tasks on the waiting queue.
          if ( task->n_ready == 0 )
          {
            FLASH_Queue_wait_enqueue( task, arg );
          }

          // Go to the next dep.
          d = d->next_dep;
        }

        // Free the task.
        FLASH_Task_free( t );
      }
    }

    n_stages++;
    if ( !verbose )
      printf( "%7d", n_stages );

    // Move ready tasks from the waiting queue to the execution queue.
    for ( i = 0; i < n_threads; i++ )
    {
      // Determine to which queue this thread belongs.
      queue = i / ( n_threads / n_queues );

      // Determine to which cache this thread belongs.
      cache = i / n_cores;

      // Dequeue a task.
      t = FLASH_Queue_wait_dequeue( queue, cache, arg );

      // Save the task for execution.
      exec_array[i] = t;

      if ( t != NULL )
      {
        // Save the thread and cache that execute the task.
        t->thread = i;
        t->cache = cache;

        // Increment the program counter.
        args->pc++;
      }
    }

    // Execute independent tasks.
    for ( i = 0; i < n_threads; i++ )
    {
      t = exec_array[i];
      FLASH_Queue_update_cache( t, arg );
      FLASH_Queue_exec_task( t );

      if ( !verbose )
        printf( "%7s", ( t == NULL ? " " : t->name ) );

      // Free the task if this is the last stage.
      if ( args->pc == n_tasks && t != NULL )
        FLASH_Task_free( t );
    }

    if ( !verbose )
      printf( "\n" );
  }

  if ( !verbose )
    printf( "\n" );

#ifdef FLA_ENABLE_WINDOWS_BUILD
  FLA_free( exec_array );
#endif

  return;
}
void FLASH_Queue_flush_block_gpu( FLA_Obj obj, int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int k;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Bool transfer = FALSE;
  FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Locate the position of the block on the GPU.
  for ( k = 0; k < gpu_n_blocks; k++ )
    if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
      break;

  // The block is owned by the GPU.
  if ( k < gpu_n_blocks )
  {
    // Save the block that will be flushed.
    gpu_obj = args->gpu[thread * gpu_n_blocks + k];

    // If the block is dirty, then flush it.
    if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
      transfer = TRUE;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Exit early if a flush is not required.
  if ( !transfer )
    return;

  // Flush the block outside the critical section.
  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Locate the position of the block on the GPU.
  for ( k = 0; k < gpu_n_blocks; k++ )
    if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
      break;

  if ( k < gpu_n_blocks )
  {
    // Update the bits for the flushed block.
    args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
    args->gpu[thread * gpu_n_blocks + k].request = FALSE;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
void FLASH_Queue_flush_gpu( int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_parallel_function().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, k;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  int n_transfer = 0;
  FLA_Obj_gpu gpu_obj;

  // Exit if not using GPU.
  if ( !FLASH_Queue_get_enabled_gpu() )
    return;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  for ( k = 0; k < gpu_n_blocks; k++ )
  {
    // Save the block that might be flushed.
    gpu_obj = args->gpu[thread * gpu_n_blocks + k];

    // Flush the block if it is dirty and requested.
    if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
    {
      // Save the block for data transfer outside the critical section.
      args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
      n_transfer++;
    }
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Exit early if a flush is not required.
  if ( n_transfer == 0 )
    return;

  // Flush the blocks outside the critical section.
  for ( i = 0; i < n_transfer; i++ )
  {
    gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
    FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Update the bits for each block that is flushed.
  for ( i = 0; i < n_transfer; i++ )
  {
    // Locate the position of the block on the GPU.
    for ( k = 0; k < gpu_n_blocks; k++ )
      if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
           args->gpu[thread * gpu_n_blocks + k].obj.base )
        break;

    if ( k < gpu_n_blocks )
    {
      // The block is now clean.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
    }
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
void FLASH_Queue_init_tasks( void* arg )
References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.
Referenced by FLASH_Queue_exec().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j, k;
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_queues = args->n_queues;
  int n_prefetch = 0;
  int n_ready = 0;
  int length = 0;
  int width = 0;
  int height = 0;
  int size = args->size;
  FLASH_Data_aff data_aff = FLASH_Queue_get_data_affinity();
  FLASH_Task* t;
  FLASH_Dep* d;
  FLA_Obj obj;

#ifdef FLA_ENABLE_GPU
  dim_t block_size = 0;
  FLA_Datatype datatype = FLA_FLOAT;
  dim_t datatype_size = FLA_Obj_datatype_size( datatype );
#endif

  // Find the 2D factorization of the number of queues.
  if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
  {
    int sq_rt = 0;
    while ( sq_rt * sq_rt <= n_queues )
      sq_rt++;
    sq_rt--;
    while ( n_queues % sq_rt != 0 )
      sq_rt--;
    length = n_queues / sq_rt;
    width  = sq_rt;
  }

  // Grab the tail of the task queue.
  t = FLASH_Queue_get_tail_task();

  for ( i = n_tasks - 1; i >= 0; i-- )
  {
    // Determine data affinity.
    if ( data_aff == FLASH_QUEUE_AFFINITY_NONE )
    {
      // No data affinity.
      t->queue = 0;
    }
    else
    {
      // Use the first output block to determine data affinity.
      obj = t->output_arg[0];

      // Use the top left block of the macroblock.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
        obj = *FLASH_OBJ_PTR_AT( obj );

      if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
      {
        // Two-dimensional block cyclic.
        t->queue = ( obj.base->m_index % length ) +
                   ( obj.base->n_index % width ) * length;
      }
      else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC )
      {
        // One-dimensional row block cyclic.
        t->queue = obj.base->m_index % n_queues;
      }
      else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC )
      {
        // One-dimensional column block cyclic.
        t->queue = obj.base->n_index % n_queues;
      }
      else
      {
        // Round-robin.
        t->queue = t->queue % n_queues;
      }
    }

    // Determine the height of each task in the DAG.
    height = 0;
    d = t->dep_arg_head;

    // Take the maximum height of dependent tasks.
    for ( j = 0; j < t->n_dep_args; j++ )
    {
      height = max( height, d->task->height );
      d = d->next_dep;
    }

    t->height = height + 1;

    // Since a task that frees a buffer is always a leaf, force it to
    // execute earlier by giving it a greater height in order to reclaim
    // memory.
    if ( t->func == (void *) FLA_Obj_free_buffer_task )
      t->height += n_tasks;

#ifdef FLA_ENABLE_GPU
    for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
    {
      // Find the correct input or output argument.
      if ( j < t->n_output_args )
        obj = t->output_arg[j];
      else
        obj = t->input_arg[j - t->n_output_args];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
        dim_t jj, kk;
        dim_t m  = FLA_Obj_length( obj );
        dim_t n  = FLA_Obj_width( obj );
        dim_t cs = FLA_Obj_col_stride( obj );
        FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

        // Check each block in the macroblock.
        for ( jj = 0; jj < n; jj++ )
        {
          for ( kk = 0; kk < m; kk++ )
          {
            obj = *( buf + jj * cs + kk );

            block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ),
                              block_size );

            if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype &&
                 FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) >
                 datatype_size )
            {
              datatype = FLA_Obj_datatype( obj );
              datatype_size = FLA_Obj_datatype_size( datatype );
            }
          }
        }
      }
      else // Regular block.
      {
        block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ),
                          block_size );

        if ( FLA_Obj_datatype( obj ) != datatype &&
             FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) >
             datatype_size )
        {
          datatype = FLA_Obj_datatype( obj );
          datatype_size = FLA_Obj_datatype_size( datatype );
        }
      }
    }
#endif

    // Find the first blocks accessed by each task.
    if ( n_prefetch < size )
    {
      for ( j = 0; j < t->n_output_args; j++ )
      {
        obj = t->output_arg[j];

        // Macroblock is used.
        if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
        {
          dim_t jj, kk;
          dim_t m  = FLA_Obj_length( obj );
          dim_t n  = FLA_Obj_width( obj );
          dim_t cs = FLA_Obj_col_stride( obj );
          FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

          // Check each block in the macroblock.
          for ( jj = 0; jj < n; jj++ )
          {
            for ( kk = 0; kk < m; kk++ )
            {
              obj = *( buf + jj * cs + kk );
              k = obj.base->n_write_blocks;

              // This block is one of the first blocks to be accessed.
              if ( k < size && k == n_prefetch )
              {
                args->prefetch[k] = obj;
                n_prefetch++;
              }
            }
          }
        }
        else // Regular block.
        {
          k = obj.base->n_write_blocks;

          // This block is one of the first blocks to be accessed.
          if ( k < size && k == n_prefetch )
          {
            args->prefetch[k] = obj;
            n_prefetch++;
          }
        }
      }
    }

    // Find all ready tasks.
    t->n_ready += t->n_input_args + t->n_output_args +
                  t->n_macro_args + t->n_war_args;

    if ( t->n_ready == 0 )
    {
      // Save the number of ready and available tasks.
      n_ready++;
    }

    // Go to the previous task.
    t = t->prev_task;
  }

  // Grab the head of the task queue.
  t = FLASH_Queue_get_head_task();

  for ( i = 0; i < n_tasks && n_ready > 0; i++ )
  {
    if ( t->n_ready == 0 )
    {
      // Enqueue all the ready and available tasks.
      FLASH_Queue_wait_enqueue( t, arg );

      // Decrement the number of ready tasks left to be enqueued.
      n_ready--;
    }

    // Go to the next task.
    t = t->next_task;
  }

#ifdef FLA_ENABLE_GPU
  args->block_size = block_size;
  args->datatype = datatype;
#endif

  return;
}
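As a worked example of the 2D affinity mapping above: with n_queues = 6 the factorization loop gives sq_rt = 2, so length = 3 and width = 2; a task whose first output block has m_index = 4 and n_index = 3 is then assigned queue ( 4 % 3 ) + ( 3 % 2 ) * 3 = 4. The same computation as a standalone sketch:

  // Standalone sketch of the 2D block-cyclic queue assignment used above.
  #include <stdio.h>

  int main( void )
  {
    int n_queues = 6, sq_rt = 0, length, width;

    // Factor n_queues into length x width, width being the largest
    // factor no greater than its square root.
    while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
    sq_rt--;
    while ( n_queues % sq_rt != 0 ) sq_rt--;
    length = n_queues / sq_rt;
    width  = sq_rt;

    // Queue assignment for the block at (m_index, n_index) = (4, 3).
    printf( "queue = %d\n", ( 4 % length ) + ( 3 % width ) * length ); // 4
    return 0;
  }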
void FLASH_Queue_invalidate_block_gpu( FLA_Obj obj, int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int j, k;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Locate the position of the block on the GPU.
  for ( k = 0; k < gpu_n_blocks; k++ )
    if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
      break;

  // The block resides on this GPU.
  if ( k < gpu_n_blocks )
  {
    // Invalidate the block.
    args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;
    args->gpu[thread * gpu_n_blocks + k].clean    = TRUE;
    args->gpu[thread * gpu_n_blocks + k].request  = FALSE;

    // Save the block that has been invalidated.
    gpu_obj = args->gpu[thread * gpu_n_blocks + k];

    // Shift the remaining blocks to fill the invalidated slot.
    for ( j = k; j < gpu_n_blocks - 1; j++ )
      args->gpu[thread * gpu_n_blocks + j] =
        args->gpu[thread * gpu_n_blocks + j + 1];

    // Move the invalidated block to the LRU position.
    args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
void FLASH_Queue_mark_gpu( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j, k;
  int thread = t->thread;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Bool duplicate;
  FLA_Obj obj;

  // Mark all the output blocks on the GPU as dirty.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    obj = t->output_arg[i];

    // Check for duplicate blocks.
    duplicate = FALSE;

    for ( j = 0; j < i && !duplicate; j++ )
    {
      if ( obj.base == t->output_arg[j].base )
        duplicate = TRUE;
    }

    // If the output block has not been processed before.
    if ( !duplicate )
    {
#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

      // Locate the position of the block on the GPU.
      for ( k = 0; k < gpu_n_blocks; k++ )
        if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
          break;

      if ( k < gpu_n_blocks )
      {
        // Change the bits for the new dirty block.
        args->gpu[thread * gpu_n_blocks + k].clean   = FALSE;
        args->gpu[thread * gpu_n_blocks + k].request = FALSE;
      }

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
    }
  }

  return;
}
void FLASH_Queue_prefetch( int cache, void* arg )
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i;
  int size = args->size;
  FLA_Obj obj;

  // Prefetch blocks in opposite order to maintain LRU.
  for ( i = size - 1; i >= 0; i-- )
  {
    obj = args->prefetch[i];

    // Only prefetch if it is a valid block.
    if ( obj.base != NULL )
    {
      // Prefetch the block.
      FLASH_Queue_prefetch_block( obj );

      // Record the prefetched block in the cache.
      args->cache[cache * size + i] = obj;
    }
  }

  return;
}
void FLASH_Queue_prefetch_block( FLA_Obj obj )
References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), scomplex::real, and dcomplex::real.
Referenced by FLASH_Queue_prefetch().
{
  int i, inc;
  int line_size = FLASH_Queue_get_cache_line_size();
  int elem_size = FLA_Obj_elem_size( obj );
  int length = FLA_Obj_length( obj );
  int width  = FLA_Obj_width( obj );
  FLA_Datatype datatype = FLA_Obj_datatype( obj );

  // Determine the stride with which to touch the block so that each cache
  // line is loaded once.
  inc = line_size / elem_size;

  // Switch between the five supported datatypes.
  switch ( datatype )
  {
    case FLA_FLOAT:
    {
      float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
      float access;

      // Access each cache line of the block.
      for ( i = 0; i < length * width; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1.0;

      break;
    }
    case FLA_DOUBLE:
    {
      double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
      double access;

      // Access each cache line of the block.
      for ( i = 0; i < length * width; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1.0;

      break;
    }
    case FLA_COMPLEX:
    {
      scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
      scomplex access;

      // Access each cache line of the block.
      for ( i = 0; i < length * width; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access.real += 1.0;

      break;
    }
    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
      dcomplex access;

      // Access each cache line of the block.
      for ( i = 0; i < length * width; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access.real += 1.0;

      break;
    }
    case FLA_INT:
    {
      int *buffer = ( int * ) FLA_INT_PTR( obj );
      int access;

      // Access each cache line of the block.
      for ( i = 0; i < length * width; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1;

      break;
    }
    default:
      // This default case should never execute.
      FLA_Check_error_code( FLA_INVALID_DATATYPE );
  }

  return;
}
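One caveat with the touch loops above: `access` is never read after the loop, so an aggressive optimizer may delete the loop despite the `access += 1.0` guard. A more reliable variant (an illustration, not part of libflame) reads through a volatile-qualified pointer so the loads cannot be elided:

  // Hedged alternative sketch: touch one element per cache line through a
  // volatile pointer, which the compiler is not allowed to optimize away.
  static void prefetch_block_volatile( const double* buffer, int n_elems,
                                       int line_size )
  {
    volatile const double* p = buffer;
    int inc = line_size / (int) sizeof( double );
    int i;

    for ( i = 0; i < n_elems; i += inc )
      (void) p[i]; // each read pulls one cache line
  }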
void FLASH_Queue_update_block_gpu( FLA_Obj obj, void** buffer_gpu, int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_update_gpu().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int j, k;
  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Bool transfer = FALSE;
  FLA_Bool evict = FALSE;
  FLA_Obj_gpu evict_obj;
  FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Locate the position of the block on the GPU; if the block is not
  // found, k stops at the LRU slot.
  for ( k = 0; k < gpu_n_blocks - 1; k++ )
    if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
      break;

  // Save the pointer to the data on the GPU.
  buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;

  // Save the victim block.
  evict_obj = args->gpu[thread * gpu_n_blocks + k];

  // The block is not already on the GPU.
  if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
  {
    // Save for data transfer outside of the critical section.
    transfer = TRUE;

    // Save for eviction outside of the critical section.
    if ( evict_obj.obj.base != NULL && !evict_obj.clean )
    {
      evict = TRUE;
      args->victim[thread] = evict_obj;
    }

    // Save the block in the data structure.
    args->gpu[thread * gpu_n_blocks + k].obj = obj;

    // Make sure the new block is clean.
    args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
    args->gpu[thread * gpu_n_blocks + k].request = FALSE;
  }

  // Use the block on the GPU that is a hit or LRU.
  gpu_obj = args->gpu[thread * gpu_n_blocks + k];

  // Shift all the previous blocks for LRU replacement.
  for ( j = k; j > 0; j-- )
    args->gpu[thread * gpu_n_blocks + j] =
      args->gpu[thread * gpu_n_blocks + j - 1];

  // Place the block in the cache as the most recently used.
  args->gpu[thread * gpu_n_blocks] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Evict and flush the LRU dirty block.
  if ( evict )
  {
    FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

    args->victim[thread].obj.base = NULL;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
  }

  // Move the block to the GPU.
  if ( transfer )
    FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

  return;
}
void FLASH_Queue_update_cache( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
  int i, j;
  FLA_Bool duplicate;
  FLA_Obj obj;

  if ( t == NULL )
    return;

  // Update the input blocks.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    // Check for duplicate blocks.
    duplicate = FALSE;

    for ( j = 0; j < t->n_output_args && !duplicate; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
    }

    for ( j = 0; j < i && !duplicate; j++ )
    {
      if ( t->input_arg[i].base == t->input_arg[j].base )
        duplicate = TRUE;
    }

    // If the input block has not been processed before.
    if ( !duplicate )
    {
      obj = t->input_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
        dim_t jj, kk;
        dim_t m  = FLA_Obj_length( obj );
        dim_t n  = FLA_Obj_width( obj );
        dim_t cs = FLA_Obj_col_stride( obj );
        FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

        // Update the cache for each input block in the macroblock.
        for ( jj = 0; jj < n; jj++ )
          for ( kk = 0; kk < m; kk++ )
            FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
                                            t->cache, FALSE, arg );
      }
      else // Regular block.
      {
        FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
      }
    }
  }

  // Update the output blocks.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    // Check for duplicate blocks.
    duplicate = FALSE;

    for ( j = 0; j < i && !duplicate; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
    }

    // If the output block has not been processed before.
    if ( !duplicate )
    {
      obj = t->output_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
        dim_t jj, kk;
        dim_t m  = FLA_Obj_length( obj );
        dim_t n  = FLA_Obj_width( obj );
        dim_t cs = FLA_Obj_col_stride( obj );
        FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

        // Update the cache for each output block in the macroblock.
        for ( jj = 0; jj < n; jj++ )
          for ( kk = 0; kk < m; kk++ )
            FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
                                            t->cache, TRUE, arg );
      }
      else // Regular block.
      {
        FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
      }
    }
  }

  return;
}
void FLASH_Queue_update_cache_block( FLA_Obj obj, int cache, FLA_Bool output, void* arg )
References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_update_cache().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j, k;
  int n_caches = args->n_caches;
  int size = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif

  // Locate the position of the block in the cache; if the block is not
  // found, k stops at the LRU slot.
  for ( k = 0; k < size - 1; k++ )
  {
    if ( obj.base == args->cache[cache * size + k].base )
      break;
  }

  // Shift all the previous blocks for LRU replacement.
  for ( j = k; j > 0; j-- )
    args->cache[cache * size + j] = args->cache[cache * size + j - 1];

  // Place the block in the cache as the most recently used.
  args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

  // Write invalidate if updating with an output block.
  if ( output )
  {
    for ( i = 0; i < n_caches; i++ )
    {
      if ( i != cache )
      {
#ifdef FLA_ENABLE_MULTITHREADING
        FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
#endif

        // Locate the position of the block in the cache.
        for ( k = 0; k < size; k++ )
        {
          if ( obj.base == args->cache[i * size + k].base )
            break;
        }

        // The block is held in another thread's cache.
        if ( k < size )
        {
          // Shift the remaining blocks to fill the invalidated slot.
          for ( j = k; j < size - 1; j++ )
            args->cache[i * size + j] = args->cache[i * size + j + 1];

          // Invalidate the block.
          args->cache[i * size + size - 1].base = NULL;
        }

#ifdef FLA_ENABLE_MULTITHREADING
        FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
      }
    }
  }

  return;
}
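The bookkeeping above is a move-to-front array: a hit (or a miss, which lands in the last slot) is shifted to index 0, so index size - 1 always holds the least recently used block. The same policy in a minimal standalone sketch on integer block ids (names hypothetical):

  // Minimal move-to-front LRU sketch mirroring the shifting logic above.
  static void lru_touch( int* cache, int size, int block )
  {
    int j, k;

    // Find the block; if absent, k stops at the LRU slot (size - 1).
    for ( k = 0; k < size - 1; k++ )
      if ( cache[k] == block ) break;

    // Shift earlier entries down and install the block as most recently used.
    for ( j = k; j > 0; j-- )
      cache[j] = cache[j - 1];
    cache[0] = block;
  }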
void FLASH_Queue_update_gpu( FLASH_Task* t, void** input_arg, void** output_arg, void* arg )
References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  int i, j, k;
  int thread = t->thread;
  int n_threads = FLASH_Queue_get_num_threads();
  FLA_Bool duplicate;

  // None of the arguments can be macroblocks yet. The complicating factor
  // is copying a macroblock to contiguous memory on the GPU.

  // Bring the input arguments to the GPU.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    // Check for duplicate blocks.
    duplicate = FALSE;

    for ( j = 0; j < t->n_output_args && !duplicate; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
    }

    for ( j = 0; j < i && !duplicate; j++ )
    {
      if ( t->input_arg[i].base == t->input_arg[j].base )
        duplicate = TRUE;
    }

    // If the input block has not been processed before.
    if ( !duplicate )
    {
      FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i,
                                    thread, arg );
    }
    else
    {
      input_arg[i] = NULL;
    }
  }

  // Bring the output arguments to the GPU.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    // Check for duplicate blocks.
    duplicate = FALSE;

    for ( j = 0; j < i && !duplicate; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
    }

    // If the output block has not been processed before.
    if ( !duplicate )
    {
      FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i,
                                    thread, arg );

      // Invalidate output blocks on all other GPUs.
      for ( k = 0; k < n_threads; k++ )
        if ( k != thread )
          FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
    }
    else
    {
      output_arg[i] = NULL;
    }
  }

  // Fill in the pointers for duplicate input blocks.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
        input_arg[i] = output_arg[j];
    }
    for ( j = 0; j < i && input_arg[i] == NULL; j++ )
    {
      if ( t->input_arg[i].base == t->input_arg[j].base )
        input_arg[i] = input_arg[j];
    }
  }

  // Fill in the pointers for duplicate output blocks.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    for ( j = 0; j < i && output_arg[i] == NULL; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
        output_arg[i] = output_arg[j];
    }
  }

  return;
}
FLASH_Task* FLASH_Queue_wait_dequeue( int queue, int cache, void* arg )
References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  FLASH_Task* t = NULL;
  FLA_Bool enabled = FALSE;

#ifdef FLA_ENABLE_GPU
  enabled = FLASH_Queue_get_enabled_gpu();
#endif

  if ( args->wait_queue[queue].n_tasks > 0 )
  {
    // Dequeue the first task.
    t = args->wait_queue[queue].head;

    if ( args->wait_queue[queue].n_tasks == 1 )
    {
      // Clear the queue of its only task.
      args->wait_queue[queue].head = NULL;
      args->wait_queue[queue].tail = NULL;
    }
    else
    {
      // Grab a new task if using cache affinity.
      if ( FLASH_Queue_get_caching() )
      {
        // Determine whether the GPU is being used.
        if ( enabled )
        {
#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
          FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***
#endif
          // Find a task that has blocks currently on the GPU.
          t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
          FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
#endif
#endif
        }
        else
        {
#ifdef FLA_ENABLE_MULTITHREADING
          FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif
          // Find a task that has blocks currently in the cache.
          t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
          FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif
        }

        // Adjust pointers if the task is the head of the waiting queue.
        if ( t->prev_wait == NULL )
        {
          args->wait_queue[queue].head = t->next_wait;
          args->wait_queue[queue].head->prev_wait = NULL;
        }
        else
        {
          t->prev_wait->next_wait = t->next_wait;
        }

        // Adjust pointers if the task is the tail of the waiting queue.
        if ( t->next_wait == NULL )
        {
          args->wait_queue[queue].tail = t->prev_wait;
          args->wait_queue[queue].tail->next_wait = NULL;
        }
        else
        {
          t->next_wait->prev_wait = t->prev_wait;
        }
      }
      else
      {
        // Adjust pointers in the waiting queue.
        args->wait_queue[queue].head = t->next_wait;
        args->wait_queue[queue].head->prev_wait = NULL;
      }
    }

    // Clear the task's waiting linked list pointers.
    t->prev_wait = NULL;
    t->next_wait = NULL;

    // Decrement the number of tasks on the waiting queue.
    args->wait_queue[queue].n_tasks--;
  }

  return t;
}
FLASH_Task* FLASH_Queue_wait_dequeue_block( int queue, int cache, void* arg )
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_wait_dequeue().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int size    = args->size;
   int n_tasks = args->wait_queue[queue].n_tasks;
   FLA_Bool enabled = FALSE;
   FLASH_Task* t;
   FLA_Obj obj;
   FLA_Obj mem;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();

   // If using GPUs, then only check the GPU and not the cache.
   if ( enabled )
      size = FLASH_Queue_get_gpu_num_blocks();
#endif

   t = args->wait_queue[queue].head;

   // Check if any of the output blocks are in the cache.
   for ( i = 0; i < n_tasks; i++ )
   {
      for ( j = 0; j < size; j++ )
      {
         // Initialize the memory just in case.
         mem.base = NULL;

         // Determine if using GPU or not.
         if ( enabled )
         {
#ifdef FLA_ENABLE_GPU
            mem = args->gpu[cache * size + j].obj;
#endif
         }
         else
         {
            mem = args->cache[cache * size + j];
         }

         for ( k = 0; k < t->n_output_args; k++ )
         {
            obj = t->output_arg[k];

            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               obj = *FLASH_OBJ_PTR_AT( obj );

            // Return the task if its output block is in cache.
            if ( mem.base == obj.base )
            {
               t->hit = TRUE;
               return t;
            }
         }
      }

      t = t->next_wait;
   }

   return args->wait_queue[queue].head;
}
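The scan above prefers a waiting task whose output blocks are already resident on the dequeuing thread's GPU (or in its software cache), falling back to the queue head when nothing hits. A simplified, lock-free model of that selection policy, with bare pointers standing in for blocks (illustrative names, not the library's types):

#include <stddef.h>

typedef struct Task
{
    void**       out_blocks;   /* blocks this task writes */
    int          n_out;
    struct Task* next_wait;
} Task;

/* Return the first waiting task with an output block resident in
   cache[0..size); fall back to the head when nothing matches. */
static Task* dequeue_block( Task* head, void** cache, int size )
{
    for ( Task* t = head; t != NULL; t = t->next_wait )
        for ( int j = 0; j < size; j++ )
            for ( int k = 0; k < t->n_out; k++ )
                if ( cache[j] == t->out_blocks[k] )
                    return t;          /* cache hit: prefer this task */

    return head;                       /* no hit: take the head */
}

int main( void )
{
    int   A, B;
    void* blkA = &A;
    void* blkB = &B;
    Task  t2   = { &blkB, 1, NULL };
    Task  t1   = { &blkA, 1, &t2 };
    void* cache[1] = { blkB };

    /* t2 is preferred over the head t1 because its block is cached. */
    return dequeue_block( &t1, cache, 1 ) == &t2 ? 0 : 1;
}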
void FLASH_Queue_wait_enqueue( FLASH_Task* t, void* arg )
References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int queue = t->queue;

   if ( args->wait_queue[queue].n_tasks == 0 )
   {
      args->wait_queue[queue].head = t;
      args->wait_queue[queue].tail = t;
   }
   else
   {
      t->prev_wait = args->wait_queue[queue].tail;

      // Insertion sort of tasks in waiting queue.
      if ( FLASH_Queue_get_sorting() )
      {
         while ( t->prev_wait != NULL )
         {
            if ( t->prev_wait->height >= t->height )
               break;

            t->next_wait = t->prev_wait;
            t->prev_wait = t->prev_wait->prev_wait;
         }
      }

      // Check if the task is the head of the waiting queue.
      if ( t->prev_wait == NULL )
         args->wait_queue[queue].head = t;
      else
         t->prev_wait->next_wait = t;

      // Check if the task is the tail of the waiting queue.
      if ( t->next_wait == NULL )
         args->wait_queue[queue].tail = t;
      else
         t->next_wait->prev_wait = t;
   }

   // Increment number of tasks on waiting queue.
   args->wait_queue[queue].n_tasks++;

   return;
}
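When sorting is enabled, the enqueue walks backward from the tail so the queue stays ordered by decreasing height, keeping tasks that head long dependency chains near the front. A compact standalone model of that insertion policy, using an illustrative Task/Queue pair rather than libflame's structures:

#include <stdio.h>
#include <stdlib.h>

typedef struct Task
{
    int          height;      /* length of dependency chain below task */
    struct Task* prev_wait;
    struct Task* next_wait;
} Task;

typedef struct { Task* head; Task* tail; int n_tasks; } Queue;

/* Insert t so the queue stays sorted by decreasing height,
   scanning backward from the tail as the source above does. */
static void wait_enqueue( Queue* q, Task* t )
{
    if ( q->n_tasks == 0 )
    {
        q->head = q->tail = t;
    }
    else
    {
        t->prev_wait = q->tail;

        while ( t->prev_wait != NULL && t->prev_wait->height < t->height )
        {
            t->next_wait = t->prev_wait;
            t->prev_wait = t->prev_wait->prev_wait;
        }

        if ( t->prev_wait == NULL ) q->head = t;
        else                        t->prev_wait->next_wait = t;

        if ( t->next_wait == NULL ) q->tail = t;
        else                        t->next_wait->prev_wait = t;
    }
    q->n_tasks++;
}

int main( void )
{
    Queue q = { NULL, NULL, 0 };
    int heights[] = { 2, 5, 3 };

    for ( int i = 0; i < 3; i++ )
    {
        Task* t = calloc( 1, sizeof( Task ) );
        t->height = heights[i];
        wait_enqueue( &q, t );
    }

    for ( Task* t = q.head; t != NULL; t = t->next_wait )
        printf( "%d ", t->height );    /* prints: 5 3 2 */
    printf( "\n" );
    return 0;
}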
FLASH_Task* FLASH_Queue_work_stealing( int queue, void* arg )
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int q;
   int n_queues = args->n_queues;
   FLASH_Task* t = NULL;

   // Do not perform work stealing if there is only one queue.
   if ( n_queues == 1 )
      return t;

   // Find a random queue not equal to the current queue.
   do
   {
#ifdef FLA_ENABLE_WINDOWS_BUILD
      rand_s( &q );
      q = q % n_queues;
#else
#ifdef FLA_ENABLE_TIDSP
      q = rand() % n_queues;
#else
      q = lrand48() % n_queues;
#endif
#endif
   }
   while ( q == queue );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
#endif

   // If there are tasks that this thread can steal.
   if ( args->wait_queue[q].n_tasks > 0 )
   {
      // Dequeue the last task.
      t = args->wait_queue[q].tail;

      if ( args->wait_queue[q].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[q].head = NULL;
         args->wait_queue[q].tail = NULL;
      }
      else
      {
         // Adjust pointers in waiting queue.
         args->wait_queue[q].tail = t->prev_wait;
         args->wait_queue[q].tail->next_wait = NULL;
      }

      // Reset waiting queue data about the stolen task.
      t->queue = queue;
      t->prev_wait = NULL;
      t->next_wait = NULL;

      args->wait_queue[q].n_tasks--;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->run_lock[q]) ); // R ***
#endif

   return t;
}
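Note that the thief takes from the tail, the opposite end from the owner's own dequeues at the head, which reduces contention between owner and thief. A single-threaded sketch of the victim selection and tail removal, with locks omitted and illustrative types:

#include <stdlib.h>

typedef struct Task { struct Task *prev_wait, *next_wait; int queue; } Task;
typedef struct { Task* head; Task* tail; int n_tasks; } Queue;

/* Steal one task from the tail of a random queue other than `self`. */
static Task* steal( Queue* queues, int n_queues, int self )
{
    int q;

    if ( n_queues == 1 ) return NULL;

    do { q = rand() % n_queues; } while ( q == self );

    if ( queues[q].n_tasks == 0 ) return NULL;

    Task* t = queues[q].tail;

    if ( queues[q].n_tasks == 1 )
        queues[q].head = queues[q].tail = NULL;
    else
    {
        queues[q].tail = t->prev_wait;
        queues[q].tail->next_wait = NULL;
    }

    t->queue = self;                   /* stolen task now belongs here */
    t->prev_wait = t->next_wait = NULL;
    queues[q].n_tasks--;
    return t;
}

int main( void )
{
    Task a = { NULL, NULL, 1 }, b = { &a, NULL, 1 };
    a.next_wait = &b;
    Queue queues[2] = { { NULL, NULL, 0 }, { &a, &b, 2 } };

    Task* t = steal( queues, 2, 0 );   /* must pick queue 1; steals b */
    return ( t == &b && queues[1].n_tasks == 1 ) ? 0 : 1;
}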
void FLASH_Task_free_parallel( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec_parallel_function().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLASH_Dep* d;
   FLASH_Dep* next_dep;
   FLA_Obj obj;

   // Clear the last write task in each output block.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t jj, kk;
         dim_t m  = FLA_Obj_length( obj );
         dim_t n  = FLA_Obj_width( obj );
         dim_t cs = FLA_Obj_col_stride( obj );
         FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               ( buf + jj * cs + kk )->base->write_task = NULL;
      }
      else // Clear regular block.
      {
         obj.base->write_task = NULL;
      }
   }

   // Clear the last read tasks in each input block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t jj, kk;
         dim_t m  = FLA_Obj_length( obj );
         dim_t n  = FLA_Obj_width( obj );
         dim_t cs = FLA_Obj_col_stride( obj );
         FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
         {
            for ( kk = 0; kk < m; kk++ )
            {
               obj = *( buf + jj * cs + kk );

               thread = obj.base->n_read_blocks % n_threads;

               FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

               k = obj.base->n_read_tasks;
               d = obj.base->read_task_head;

               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;

               FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

               for ( j = 0; j < k; j++ )
               {
                  next_dep = d->next_dep;
                  FLA_free( d );
                  d = next_dep;
               }
            }
         }
      }
      else // Regular block.
      {
         thread = obj.base->n_read_blocks % n_threads;

         FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

         k = obj.base->n_read_tasks;
         d = obj.base->read_task_head;

         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;

         FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

         for ( j = 0; j < k; j++ )
         {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
         }
      }
   }

   // Free the dep_arg field of t.
   d = t->dep_arg_head;

   for ( i = 0; i < t->n_dep_args; i++ )
   {
      next_dep = d->next_dep;
      FLA_free( d );
      d = next_dep;
   }

   // Free the int_arg field of t.
   FLA_free( t->int_arg );

   // Free the fla_arg field of t.
   FLA_free( t->fla_arg );

   // Free the input_arg field of t.
   FLA_free( t->input_arg );

   // Free the output_arg field of t.
   FLA_free( t->output_arg );

   // Finally, free the struct itself.
   FLA_free( t );

   return;
}
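Each read-task list is drained with the usual walk-and-free idiom: the next pointer must be saved before the node is released, since reading a freed node is undefined behavior. A minimal self-contained version of that loop (illustrative Dep type, standard malloc/free in place of FLA_free):

#include <stdlib.h>

typedef struct Dep { struct Dep* next_dep; /* ... payload ... */ } Dep;

/* Walk and free an n-node dependency list; the next pointer is saved
   before each node is freed, exactly as the loops above do. */
static void free_deps( Dep* d, int n )
{
    for ( int i = 0; i < n; i++ )
    {
        Dep* next_dep = d->next_dep;
        free( d );
        d = next_dep;
    }
}

int main( void )
{
    Dep* head = NULL;

    for ( int i = 0; i < 3; i++ )      /* build a three-node list */
    {
        Dep* d = malloc( sizeof( Dep ) );
        d->next_dep = head;
        head = d;
    }

    free_deps( head, 3 );
    return 0;
}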
FLASH_Task* FLASH_Task_update_binding( FLASH_Task* t, FLASH_Task* r, void* arg )
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.
Referenced by FLASH_Task_update_dependencies().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int queue;

   if ( r == NULL )
   {
      // There are no tasks on the waiting queue, so bind the first task.
      r = t;
      r->hit = TRUE;
   }
   else
   {
      // Swap the bound task for the new ready task.
      if ( !r->hit ||
           ( FLASH_Queue_get_sorting() && r->height < t->height ) )
      {
         queue = r->queue;

         r->hit = FALSE;

         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

         // Place the swapped task back onto the waiting queue.
         FLASH_Queue_wait_enqueue( r, arg );

         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***

         // Bind the new ready task.
         r = t;
         r->hit = TRUE;
      }
      else // Keep the bound task and enqueue the new ready task.
      {
         queue = t->queue;

         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

         FLASH_Queue_wait_enqueue( t, arg );

         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
   }

   return r;
}
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec_parallel_function().
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i;
   int q = t->queue;
   int queue;
   int thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool caching  = FLASH_Queue_get_caching();
   FLA_Bool stealing = FLASH_Queue_get_work_stealing();
   FLA_Bool available;
   FLASH_Task* task;
   FLASH_Task* r = NULL;
   FLASH_Dep*  d = t->dep_arg_head;

   // Dequeue a task to bind to this thread if caching is enabled.
   if ( caching )
   {
      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      // Obtain a task to execute.
      r = FLASH_Queue_wait_dequeue( q, t->cache, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***
   }

   // Check each dependent task.
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      if ( stealing )
      {
         // Place all dependent tasks onto the same queue as the
         // predecessor task.
         d->task->queue = q;
      }

      task   = d->task;
      queue  = task->queue;
      thread = task->order % n_threads;

      FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***

      task->n_ready--;
      available = ( task->n_ready == 0 );

      FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***

      // Place newly ready tasks on the waiting queue.
      if ( available )
      {
         // If caching is enabled and the task belongs to this
         // thread's queue.
         if ( caching && q == queue )
         {
            // Determine whether there is a new bound task.
            r = FLASH_Task_update_binding( task, r, arg );
         }
         else
         {
            FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

            FLASH_Queue_wait_enqueue( task, arg );

            FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
         }
      }

      // Go to the next dependency.
      d = d->next_dep;
   }

   return r;
}
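Readiness is tracked by a per-task counter of unsatisfied dependencies; the decrement and the zero test must happen inside the same critical section, or two predecessors finishing concurrently could both (or neither) observe the count reaching zero. A sketch of that critical section using a pthreads mutex in place of FLA_Lock (illustrative types, not libflame's):

#include <pthread.h>
#include <stdbool.h>

typedef struct
{
    int             n_ready;   /* unsatisfied dependency count */
    pthread_mutex_t lock;      /* stands in for args->dep_lock[...] */
} Dep_counter;

/* Decrement the counter under the lock; the caller enqueues the task
   exactly when this returns true, as in the loop above. */
static bool dep_satisfy( Dep_counter* c )
{
    bool available;

    pthread_mutex_lock( &c->lock );
    c->n_ready--;
    available = ( c->n_ready == 0 );
    pthread_mutex_unlock( &c->lock );

    return available;
}

int main( void )
{
    Dep_counter c = { 2, PTHREAD_MUTEX_INITIALIZER };

    bool first  = dep_satisfy( &c );   /* false: one dependency left */
    bool second = dep_satisfy( &c );   /* true: task is now ready    */

    return ( !first && second ) ? 0 : 1;
}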
int RCCE_acquire_lock( int )
Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_init_tasks(), and FLASH_Task_update_dependencies().
int RCCE_release_lock( int )
Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_init_tasks(), and FLASH_Task_update_dependencies().
int RCCE_ue( void )
double RCCE_wtime( void )
Referenced by FLASH_Queue_exec().
void Synch_all( )
Referenced by FLASH_Queue_exec().