|
libflame
12600
|
Functions | |
| void | FLASH_Queue_init_gpu (void) |
| void | FLASH_Queue_finalize_gpu (void) |
| FLA_Error | FLASH_Queue_enable_gpu (void) |
| FLA_Error | FLASH_Queue_disable_gpu (void) |
| FLA_Bool | FLASH_Queue_get_enabled_gpu (void) |
| void | FLASH_Queue_set_gpu_num_blocks (dim_t n_blocks) |
| dim_t | FLASH_Queue_get_gpu_num_blocks (void) |
| FLA_Error | FLASH_Queue_bind_gpu (int thread) |
| FLA_Error | FLASH_Queue_alloc_gpu (dim_t size, FLA_Datatype datatype, void **buffer_gpu) |
| FLA_Error | FLASH_Queue_free_gpu (void *buffer_gpu) |
| FLA_Error | FLASH_Queue_write_gpu (FLA_Obj obj, void *buffer_gpu) |
| FLA_Error | FLASH_Queue_read_gpu (FLA_Obj obj, void *buffer_gpu) |
| void | FLASH_Queue_exec_task_gpu (FLASH_Task *t, void **input_arg, void **output_arg) |
| FLA_Error FLASH_Queue_alloc_gpu | ( | dim_t | size, |
| FLA_Datatype | datatype, | ||
| void ** | buffer_gpu | ||
| ) |
References FLA_Obj_datatype_size().
Referenced by FLASH_Queue_create_gpu().
{
  // Ask cuBLAS for GPU storage holding `size` elements, each as wide as
  // the given datatype; the device pointer is written to *buffer_gpu.
  cublasStatus alloc_status = cublasAlloc( size,
                                           FLA_Obj_datatype_size( datatype ),
                                           buffer_gpu );

  // A failed allocation is reported through the library's error check.
  if ( CUBLAS_STATUS_SUCCESS != alloc_status )
  {
    FLA_Check_error_code( FLA_MALLOC_GPU_RETURNED_NULL_POINTER );
  }

  return FLA_SUCCESS;
}
| FLA_Error FLASH_Queue_bind_gpu | ( | int | thread | ) |
Referenced by FLASH_Queue_create_gpu().
{
  // Bind a GPU to this thread, using the thread index as the CUDA device
  // ordinal.  cudaSetDevice() signals a rejected ordinal through its
  // return value, so pass that on to the caller instead of always
  // reporting success.
  if ( cudaSetDevice( thread ) != cudaSuccess )
    return FLA_FAILURE;

  return FLA_SUCCESS;
}
| FLA_Error FLASH_Queue_disable_gpu | ( | void | ) |
References FLASH_Queue_stack_depth().
{
  // The GPU setting cannot change while a parallel region is in progress
  // (non-zero queue stack depth).
  if ( FLASH_Queue_stack_depth() != 0 )
    return FLA_FAILURE;

  // No parallel region has begun yet, so turn GPU execution off.
  flash_queue_enabled_gpu = FALSE;

  return FLA_SUCCESS;
}
| FLA_Error FLASH_Queue_enable_gpu | ( | void | ) |
References FLASH_Queue_get_enabled(), and FLASH_Queue_stack_depth().
{
  // Refuse the request inside a parallel region (non-zero queue stack
  // depth) or when SuperMatrix itself is not enabled.
  if ( FLASH_Queue_stack_depth() != 0 || !FLASH_Queue_get_enabled() )
    return FLA_FAILURE;

  // No parallel region has begun and SuperMatrix is on: turn GPU
  // execution on.
  flash_queue_enabled_gpu = TRUE;

  return FLA_SUCCESS;
}
| void FLASH_Queue_exec_task_gpu | ( | FLASH_Task * | t, |
| void ** | input_arg, | ||
| void ** | output_arg | ||
| ) |
References FLASH_Task_s::fla_arg, FLA_Axpy_external_gpu(), FLA_Axpy_task(), FLA_Copy_external_gpu(), FLA_Copy_task(), FLA_Gemm_external_gpu(), FLA_Gemm_task(), FLA_Gemv_external_gpu(), FLA_Gemv_task(), FLA_Hemm_external_gpu(), FLA_Hemm_task(), FLA_Her2k_external_gpu(), FLA_Her2k_task(), FLA_Herk_external_gpu(), FLA_Herk_task(), FLA_Scal_external_gpu(), FLA_Scal_task(), FLA_Scalr_external_gpu(), FLA_Scalr_task(), FLA_Symm_external_gpu(), FLA_Symm_task(), FLA_Syr2k_external_gpu(), FLA_Syr2k_task(), FLA_Syrk_external_gpu(), FLA_Syrk_task(), FLA_Trmm_external_gpu(), FLA_Trmm_task(), FLA_Trsm_external_gpu(), FLA_Trsm_task(), FLA_Trsv_external_gpu(), FLA_Trsv_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_gpu().
{
  // Function pointer types for the GPU routines.  Each external GPU
  // routine is cast to the matching type before it is invoked.

  // Level-3 BLAS
  typedef FLA_Error (*flash_gemm_gpu_p)( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_hemm_gpu_p)( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_herk_gpu_p)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_her2k_gpu_p)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_symm_gpu_p)( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_syrk_gpu_p)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_syr2k_gpu_p)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_trmm_gpu_p)( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj C, void* C_gpu );
  typedef FLA_Error (*flash_trsm_gpu_p)( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj C, void* C_gpu );

  // Level-2 BLAS
  typedef FLA_Error (*flash_gemv_gpu_p)( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu, FLA_Obj beta, FLA_Obj y, void* y_gpu );
  typedef FLA_Error (*flash_trsv_gpu_p)( FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu );

  // Level-1 BLAS
  typedef FLA_Error (*flash_axpy_gpu_p)( FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
  typedef FLA_Error (*flash_copy_gpu_p)( FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
  typedef FLA_Error (*flash_scal_gpu_p)( FLA_Obj alpha, FLA_Obj A, void* A_gpu );
  typedef FLA_Error (*flash_scalr_gpu_p)( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu );

  // A NULL task carries no work.
  if ( t == NULL )
    return;

  // Dispatch on the task's function pointer.  In every branch the task's
  // own FLA_Obj arguments are passed alongside the matching entries of the
  // caller-supplied input_arg / output_arg pointer arrays.

  // FLA_Gemm
  if ( t->func == (void *) FLA_Gemm_task )
  {
    flash_gemm_gpu_p gemm_gpu = (flash_gemm_gpu_p) FLA_Gemm_external_gpu;

    gemm_gpu( ( FLA_Trans ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->input_arg[1], input_arg[1],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Hemm
  else if ( t->func == (void *) FLA_Hemm_task )
  {
    flash_hemm_gpu_p hemm_gpu = (flash_hemm_gpu_p) FLA_Hemm_external_gpu;

    hemm_gpu( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->input_arg[1], input_arg[1],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Herk
  else if ( t->func == (void *) FLA_Herk_task )
  {
    flash_herk_gpu_p herk_gpu = (flash_herk_gpu_p) FLA_Herk_external_gpu;

    herk_gpu( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Her2k
  else if ( t->func == (void *) FLA_Her2k_task )
  {
    flash_her2k_gpu_p her2k_gpu = (flash_her2k_gpu_p) FLA_Her2k_external_gpu;

    her2k_gpu( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
               t->fla_arg[0],
               t->input_arg[0], input_arg[0],
               t->input_arg[1], input_arg[1],
               t->fla_arg[1],
               t->output_arg[0], output_arg[0] );
  }
  // FLA_Symm
  else if ( t->func == (void *) FLA_Symm_task )
  {
    flash_symm_gpu_p symm_gpu = (flash_symm_gpu_p) FLA_Symm_external_gpu;

    symm_gpu( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->input_arg[1], input_arg[1],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Syrk
  else if ( t->func == (void *) FLA_Syrk_task )
  {
    flash_syrk_gpu_p syrk_gpu = (flash_syrk_gpu_p) FLA_Syrk_external_gpu;

    syrk_gpu( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Syr2k
  else if ( t->func == (void *) FLA_Syr2k_task )
  {
    flash_syr2k_gpu_p syr2k_gpu = (flash_syr2k_gpu_p) FLA_Syr2k_external_gpu;

    syr2k_gpu( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
               t->fla_arg[0],
               t->input_arg[0], input_arg[0],
               t->input_arg[1], input_arg[1],
               t->fla_arg[1],
               t->output_arg[0], output_arg[0] );
  }
  // FLA_Trmm
  else if ( t->func == (void *) FLA_Trmm_task )
  {
    flash_trmm_gpu_p trmm_gpu = (flash_trmm_gpu_p) FLA_Trmm_external_gpu;

    trmm_gpu( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
              ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Trsm
  else if ( t->func == (void *) FLA_Trsm_task )
  {
    flash_trsm_gpu_p trsm_gpu = (flash_trsm_gpu_p) FLA_Trsm_external_gpu;

    trsm_gpu( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
              ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Gemv
  else if ( t->func == (void *) FLA_Gemv_task )
  {
    flash_gemv_gpu_p gemv_gpu = (flash_gemv_gpu_p) FLA_Gemv_external_gpu;

    gemv_gpu( ( FLA_Trans ) t->int_arg[0],
              t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->input_arg[1], input_arg[1],
              t->fla_arg[1],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Trsv
  else if ( t->func == (void *) FLA_Trsv_task )
  {
    flash_trsv_gpu_p trsv_gpu = (flash_trsv_gpu_p) FLA_Trsv_external_gpu;

    trsv_gpu( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
              ( FLA_Diag ) t->int_arg[2],
              t->input_arg[0], input_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Axpy
  else if ( t->func == (void *) FLA_Axpy_task )
  {
    flash_axpy_gpu_p axpy_gpu = (flash_axpy_gpu_p) FLA_Axpy_external_gpu;

    axpy_gpu( t->fla_arg[0],
              t->input_arg[0], input_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Copy
  else if ( t->func == (void *) FLA_Copy_task )
  {
    flash_copy_gpu_p copy_gpu = (flash_copy_gpu_p) FLA_Copy_external_gpu;

    copy_gpu( t->input_arg[0], input_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Scal
  else if ( t->func == (void *) FLA_Scal_task )
  {
    flash_scal_gpu_p scal_gpu = (flash_scal_gpu_p) FLA_Scal_external_gpu;

    scal_gpu( t->fla_arg[0],
              t->output_arg[0], output_arg[0] );
  }
  // FLA_Scalr
  else if ( t->func == (void *) FLA_Scalr_task )
  {
    flash_scalr_gpu_p scalr_gpu = (flash_scalr_gpu_p) FLA_Scalr_external_gpu;

    scalr_gpu( ( FLA_Uplo ) t->int_arg[0],
               t->fla_arg[0],
               t->output_arg[0], output_arg[0] );
  }
  // Any other task type has no GPU implementation here.
  else
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  return;
}
| void FLASH_Queue_finalize_gpu | ( | void | ) |
Referenced by FLASH_Queue_finalize().
{
  // Shut down cuBLAS.  The status it returns is not examined.
  ( void ) cublasShutdown();
}
| FLA_Error FLASH_Queue_free_gpu | ( | void * | buffer_gpu | ) |
Referenced by FLASH_Queue_destroy_gpu().
{
  // Free memory for a block on GPU.  cublasFree() reports problems
  // through its status, so a failed release is passed on to the caller
  // instead of being reported as success.
  if ( cublasFree( buffer_gpu ) != CUBLAS_STATUS_SUCCESS )
    return FLA_FAILURE;

  return FLA_SUCCESS;
}
| FLA_Bool FLASH_Queue_get_enabled_gpu | ( | void | ) |
References FLASH_Queue_get_enabled().
Referenced by FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_flush_gpu(), FLASH_Queue_wait_dequeue(), and FLASH_Queue_wait_dequeue_block().
{
  // The stored GPU flag only counts while SuperMatrix is enabled;
  // otherwise the answer is always FALSE.
  return FLASH_Queue_get_enabled() ? flash_queue_enabled_gpu : FALSE;
}
| dim_t FLASH_Queue_get_gpu_num_blocks | ( | void | ) |
Referenced by FLASH_Queue_check_block_gpu(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_flush_gpu(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_block_gpu(), and FLASH_Queue_wait_dequeue_block().
{
  // Hand back the stored GPU block count.
  const dim_t n_blocks = flash_queue_gpu_n_blocks;

  return n_blocks;
}
| void FLASH_Queue_init_gpu | ( | void | ) |
Referenced by FLASH_Queue_init().
{
  // Start up cuBLAS.  The status it returns is not examined.
  ( void ) cublasInit();
}
| FLA_Error FLASH_Queue_read_gpu | ( | FLA_Obj | obj, |
| void * | buffer_gpu | ||
| ) |
References FLA_Obj_buffer_at_view(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_length(), and FLA_Obj_width().
Referenced by FLASH_Queue_destroy_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_flush_gpu(), and FLASH_Queue_update_block_gpu().
{
  dim_t        m = FLA_Obj_length( obj );
  dim_t        n = FLA_Obj_width( obj );
  cublasStatus status;

  // An empty block has nothing to transfer.  Returning here also avoids
  // handing cuBLAS a zero leading dimension, which it rejects as an
  // invalid value.
  if ( m == 0 || n == 0 )
    return FLA_SUCCESS;

  // Read the memory of a block on GPU to main memory.  The GPU copy uses
  // the block length as its leading dimension; the host side uses the
  // object's column stride.
  status = cublasGetMatrix( m,
                            n,
                            FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ),
                            buffer_gpu,
                            m,
                            FLA_Obj_buffer_at_view( obj ),
                            FLA_Obj_col_stride( obj ) );

  // Report a failed transfer instead of claiming success.
  if ( status != CUBLAS_STATUS_SUCCESS )
    return FLA_FAILURE;

  return FLA_SUCCESS;
}
| void FLASH_Queue_set_gpu_num_blocks | ( | dim_t | n_blocks | ) |
{
  // Store the requested GPU block count for later queries.
  flash_queue_gpu_n_blocks = n_blocks;
}
| FLA_Error FLASH_Queue_write_gpu | ( | FLA_Obj | obj, |
| void * | buffer_gpu | ||
| ) |
References FLA_Obj_buffer_at_view(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_length(), and FLA_Obj_width().
Referenced by FLASH_Queue_update_block_gpu().
{
  dim_t        m = FLA_Obj_length( obj );
  dim_t        n = FLA_Obj_width( obj );
  cublasStatus status;

  // An empty block has nothing to transfer.  Returning here also avoids
  // handing cuBLAS a zero leading dimension, which it rejects as an
  // invalid value.
  if ( m == 0 || n == 0 )
    return FLA_SUCCESS;

  // Write the contents of a block in main memory to GPU.  The host side
  // uses the object's column stride; the GPU copy uses the block length
  // as its leading dimension.
  status = cublasSetMatrix( m,
                            n,
                            FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ),
                            FLA_Obj_buffer_at_view( obj ),
                            FLA_Obj_col_stride( obj ),
                            buffer_gpu,
                            m );

  // Report a failed transfer instead of claiming success.
  if ( status != CUBLAS_STATUS_SUCCESS )
    return FLA_FAILURE;

  return FLA_SUCCESS;
}
1.7.6.1