| 
    libflame
    12600
    
   
   | 
  
  
  
 
void FLASH_Queue_begin( void )
References FLA_Clock().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
#ifdef FLA_ENABLE_SUPERMATRIX
   // Entering the outermost region: remember when it started so that the
   // total execution time can be derived later.
   if ( 0 == flash_queue_stack )
      flash_queue_total_time = FLA_Clock();
#endif

   // One more nested region is now open.
   ++flash_queue_stack;
}
FLA_Error FLASH_Queue_disable( void )
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
#ifdef FLA_ENABLE_SUPERMATRIX
   // The enabled status is frozen while a parallel region is open.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;
#endif

   // Reached either before any parallel region has begun, or in a build
   // without SuperMatrix; disabling enqueuing is allowed in both cases.
   flash_queue_enabled = FALSE;
   return FLA_SUCCESS;
}
FLA_Error FLASH_Queue_enable( void )
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
#ifndef FLA_ENABLE_SUPERMATRIX
   // Enqueuing cannot be switched on in a build without SuperMatrix:
   // report the error and fail.
   FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );
   return FLA_FAILURE;
#else
   // The enabled status is frozen while a parallel region is open.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;

   // No parallel region has begun yet, so the switch is permitted.
   flash_queue_enabled = TRUE;
   return FLA_SUCCESS;
#endif
}
void FLASH_Queue_end( void )
References FLA_Clock(), and FLASH_Queue_exec().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
   // Close the innermost open region.
   --flash_queue_stack;

#ifdef FLA_ENABLE_SUPERMATRIX
   // Nothing more to do while an enclosing region is still open.
   if ( flash_queue_stack != 0 )
      return;

   // The outermost region has just closed: run the queued tasks, then turn
   // the saved start time into the elapsed total execution time.
   FLASH_Queue_exec();
   flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
#endif
}
void FLASH_Queue_exec_task( FLASH_Task * t )
References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
   // Executes the single task t by calling the function stored in t->func.
   //
   // t->func is compared against each known FLA_*_task function. On a match
   // it is cast to the corresponding local function-pointer type and invoked
   // with operands taken from the task's int_arg, fla_arg, input_arg and
   // output_arg arrays, followed by t->cntl cast to the matching control
   // type. The position of each operand in the call is specific to the task
   // type and does not always follow the array index order.
   //
   // A NULL task is silently ignored. A task whose function matches none of
   // the branches is reported via FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ).
   // The FLA_Error value returned by the invoked task function is discarded.

   // Define local function pointer types.
   // LAPACK-level
   typedef FLA_Error(*flash_lu_piv_macro_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
   typedef FLA_Error(*flash_apply_pivots_macro_p)(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl);
   typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl);
   typedef FLA_Error(*flash_lu_piv_copy_p)(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl);
   typedef FLA_Error(*flash_trsm_piv_p)(FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl);
   typedef FLA_Error(*flash_sa_lu_p)(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl);
   typedef FLA_Error(*flash_sa_fs_p)(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl);
   typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl);
   typedef FLA_Error(*flash_trinv_p)(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl);
   typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl);
   typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl);
   typedef FLA_Error(*flash_sylv_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl);
   typedef FLA_Error(*flash_lyap_p)(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl);
   typedef FLA_Error(*flash_qrut_macro_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qrutc_p)(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl);
   typedef FLA_Error(*flash_lqut_macro_p)(FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl);
   typedef FLA_Error(*flash_caqr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl);
   typedef FLA_Error(*flash_uddateut_p)(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl);
   typedef FLA_Error(*flash_apqut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl);
   typedef FLA_Error(*flash_apq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl);
   typedef FLA_Error(*flash_apcaq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl);
   typedef FLA_Error(*flash_apqudut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl);
   typedef FLA_Error(*flash_eig_gest_p)(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl);
   // Level-3 BLAS
   typedef FLA_Error(*flash_gemm_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl);
   typedef FLA_Error(*flash_hemm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl);
   typedef FLA_Error(*flash_herk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl);
   typedef FLA_Error(*flash_her2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl);
   typedef FLA_Error(*flash_symm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl);
   typedef FLA_Error(*flash_syrk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl);
   typedef FLA_Error(*flash_syr2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl);
   typedef FLA_Error(*flash_trmm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl);
   typedef FLA_Error(*flash_trsm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl);
   // Level-2 BLAS
   typedef FLA_Error(*flash_gemv_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl);
   typedef FLA_Error(*flash_trsv_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl);
   // Level-1 BLAS
   typedef FLA_Error(*flash_axpy_p)(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl);
   typedef FLA_Error(*flash_axpyt_p)(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl);
   typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl);
   typedef FLA_Error(*flash_copyt_p)(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl);
   typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl);
   typedef FLA_Error(*flash_scal_p)(FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl);
   typedef FLA_Error(*flash_scalr_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl);
   // Base
   typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl);
   typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl);
   // Only execute task if it is not NULL.
   if ( t == NULL )
      return;
   // Now "switch" between the various possible task functions.
   // (An if/else-if chain is used because the selector is a function
   // pointer, which cannot be the controlling value of a C switch.)
   // FLA_LU_piv_macro
   if ( t->func == (void *) FLA_LU_piv_macro_task )
   {
      flash_lu_piv_macro_p func;
      func = (flash_lu_piv_macro_p) t->func;
      
      func(               t->output_arg[0],
                          t->output_arg[1],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Apply_pivots_macro
   else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
   {
      flash_apply_pivots_macro_p func;
      func = (flash_apply_pivots_macro_p) t->func;
      
      func( ( FLA_Side  )    t->int_arg[0],
            ( FLA_Trans )    t->int_arg[1],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_appiv_t* ) t->cntl );
   }
   // FLA_LU_piv
   else if ( t->func == (void *) FLA_LU_piv_task )
   {
      flash_lu_piv_p func;
      func = (flash_lu_piv_p) t->func;
      
      func(               t->output_arg[0],
                          t->fla_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_LU_piv_copy
   else if ( t->func == (void *) FLA_LU_piv_copy_task )
   {
      flash_lu_piv_copy_p func;
      func = (flash_lu_piv_copy_p) t->func;
      func(               t->output_arg[0],
                          t->fla_arg[0],
                          t->output_arg[1],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Trsm_piv
   else if ( t->func == (void *) FLA_Trsm_piv_task )
   {
      flash_trsm_piv_p func;
      func = (flash_trsm_piv_p) t->func;
      func(                 t->input_arg[0],
                            t->output_arg[0],
                            t->fla_arg[0],
            ( fla_trsm_t* ) t->cntl );
   }
   // FLA_SA_LU
   else if ( t->func == (void *) FLA_SA_LU_task )
   {
      flash_sa_lu_p func;
      func = (flash_sa_lu_p) t->func;
      
      func(               t->output_arg[1],
                          t->output_arg[0],
                          t->fla_arg[0],
                          t->fla_arg[1],
                          t->int_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_SA_FS
   else if ( t->func == (void *) FLA_SA_FS_task )
   {
      flash_sa_fs_p func;
      func = (flash_sa_fs_p) t->func;
         
      func(                 t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],                          
                            t->output_arg[1],
                            t->output_arg[0],
                            t->int_arg[0],
            ( fla_gemm_t* ) t->cntl );
   }
   // FLA_LU_nopiv
   else if ( t->func == (void *) FLA_LU_nopiv_task )
   {
      flash_lu_nopiv_p func;
      func = (flash_lu_nopiv_p) t->func;
      func(               t->output_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Trinv
   else if ( t->func == (void *) FLA_Trinv_task )
   {
      flash_trinv_p func;
      func = (flash_trinv_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Diag     ) t->int_arg[1],
                             t->output_arg[0],
            ( fla_trinv_t* ) t->cntl );
   }
   // FLA_Ttmm
   else if ( t->func == (void *) FLA_Ttmm_task )
   {   
      flash_ttmm_p func;
      func = (flash_ttmm_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
                            t->output_arg[0],
            ( fla_ttmm_t* ) t->cntl );
   }
   // FLA_Chol
   else if ( t->func == (void *) FLA_Chol_task )
   {   
      flash_chol_p func;
      func = (flash_chol_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
                            t->output_arg[0],
            ( fla_chol_t* ) t->cntl );
   }
   // FLA_Sylv
   else if ( t->func == (void *) FLA_Sylv_task )
   {   
      flash_sylv_p func;
      func = (flash_sylv_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->output_arg[0],
                            t->fla_arg[1],
            ( fla_sylv_t* ) t->cntl );
   }
   // FLA_Lyap
   else if ( t->func == (void *) FLA_Lyap_task )
   {   
      flash_lyap_p func;
      func = (flash_lyap_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
                            t->fla_arg[1],
            ( fla_lyap_t* ) t->cntl );
   }
   // FLA_QR_UT_macro
   else if ( t->func == (void *) FLA_QR_UT_macro_task )
   {   
      flash_qrut_macro_p func;
      func = (flash_qrut_macro_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR_UT
   else if ( t->func == (void *) FLA_QR_UT_task )
   {   
      flash_qrut_p func;
      func = (flash_qrut_p) t->func;
      
      func(                 t->output_arg[0],
                            t->fla_arg[0],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR_UT_copy
   else if ( t->func == (void *) FLA_QR_UT_copy_task )
   {   
      flash_qrutc_p func;
      func = (flash_qrutc_p) t->func;
      func(                 t->output_arg[0],
                            t->fla_arg[0],
                            t->output_arg[1],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR2_UT
   else if ( t->func == (void *) FLA_QR2_UT_task )
   {   
      flash_qr2ut_p func;
      func = (flash_qr2ut_p) t->func;
         
      func(                 t->output_arg[1],
                            t->output_arg[0],
                            t->fla_arg[0],
           ( fla_qr2ut_t* ) t->cntl );
   }
   // FLA_LQ_UT_macro
   else if ( t->func == (void *) FLA_LQ_UT_macro_task )
   {   
      flash_lqut_macro_p func;
      func = (flash_lqut_macro_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
            ( fla_lqut_t* ) t->cntl );
   }
   // FLA_CAQR2_UT
   else if ( t->func == (void *) FLA_CAQR2_UT_task )
   {   
      flash_caqr2ut_p func;
      func = (flash_caqr2ut_p) t->func;
         
      func(                 t->output_arg[1],
                            t->output_arg[0],
                            t->fla_arg[0],
         ( fla_caqr2ut_t* ) t->cntl );
   }
   // FLA_UDdate_UT
   else if ( t->func == (void *) FLA_UDdate_UT_task )
   {   
      flash_uddateut_p func;
      func = (flash_uddateut_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
                            t->output_arg[2],
                            t->output_arg[3],
        ( fla_uddateut_t* ) t->cntl );
   }
   // FLA_Apply_Q_UT
   else if ( t->func == (void *) FLA_Apply_Q_UT_task )
   {   
      flash_apqut_p func;
      func = (flash_apqut_p) t->func;
      
      func( ( FLA_Side     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
            ( FLA_Direct   ) t->int_arg[2],
            ( FLA_Store    ) t->int_arg[3],
                             t->input_arg[0],
                             t->fla_arg[0],
                             t->output_arg[1],
                             t->output_arg[0],
            ( fla_apqut_t* ) t->cntl );
   }
   // FLA_Apply_Q2_UT
   else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
   {   
      flash_apq2ut_p func;
      func = (flash_apq2ut_p) t->func;
      
      func( ( FLA_Side      ) t->int_arg[0],
            ( FLA_Trans     ) t->int_arg[1],
            ( FLA_Direct    ) t->int_arg[2],
            ( FLA_Store     ) t->int_arg[3],
                              t->input_arg[0],
                              t->fla_arg[0],
                              t->output_arg[2],
                              t->output_arg[1],
                              t->output_arg[0],
            ( fla_apq2ut_t* ) t->cntl );
   }
   // FLA_Apply_CAQ2_UT
   else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
   {   
      flash_apcaq2ut_p func;
      func = (flash_apcaq2ut_p) t->func;
      
      func( ( FLA_Side      ) t->int_arg[0],
            ( FLA_Trans     ) t->int_arg[1],
            ( FLA_Direct    ) t->int_arg[2],
            ( FLA_Store     ) t->int_arg[3],
                              t->input_arg[0],
                              t->fla_arg[0],
                              t->output_arg[2],
                              t->output_arg[1],
                              t->output_arg[0],
          ( fla_apcaq2ut_t* ) t->cntl );
   }
   // FLA_Apply_QUD_UT
   else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
   {   
      flash_apqudut_p func;
      func = (flash_apqudut_p) t->func;
      
      func( ( FLA_Side       ) t->int_arg[0],
            ( FLA_Trans      ) t->int_arg[1],
            ( FLA_Direct     ) t->int_arg[2],
            ( FLA_Store      ) t->int_arg[3],
                               t->input_arg[0],
                               t->output_arg[0],
                               t->output_arg[1],
                               t->input_arg[1],
                               t->output_arg[2],
                               t->input_arg[2],
                               t->output_arg[3],
            ( fla_apqudut_t* ) t->cntl );
   }
   // FLA_Eig_gest
   else if ( t->func == (void *) FLA_Eig_gest_task )
   {   
      flash_eig_gest_p func;
      func = (flash_eig_gest_p) t->func;
      
      func( ( FLA_Inv         ) t->int_arg[0],
            ( FLA_Uplo        ) t->int_arg[1],
                                t->output_arg[1],
                                t->output_arg[0],
                                t->input_arg[0],
            ( fla_eig_gest_t* ) t->cntl );
   }
   // FLA_Gemm
   else if ( t->func == (void *) FLA_Gemm_task )
   {
      flash_gemm_p func;
      func = (flash_gemm_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_gemm_t* ) t->cntl );
   }
   // FLA_Hemm
   else if ( t->func == (void *) FLA_Hemm_task )
   {
      flash_hemm_p func;
      func = (flash_hemm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_hemm_t* ) t->cntl );
   }
   // FLA_Herk
   else if ( t->func == (void *) FLA_Herk_task )
   {
      flash_herk_p func;
      func = (flash_herk_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_herk_t* ) t->cntl );
   }
   // FLA_Her2k
   else if ( t->func == (void *) FLA_Her2k_task )
   {
      flash_her2k_p func;
      func = (flash_her2k_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
            ( fla_her2k_t* ) t->cntl );
   }
   // FLA_Symm
   else if ( t->func == (void *) FLA_Symm_task )
   {
      flash_symm_p func;
      func = (flash_symm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_symm_t* ) t->cntl );
   }
   // FLA_Syrk
   else if ( t->func == (void *) FLA_Syrk_task )
   {
      flash_syrk_p func;
      func = (flash_syrk_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_syrk_t* ) t->cntl );
   }
   // FLA_Syr2k
   else if ( t->func == (void *) FLA_Syr2k_task )
   {
      flash_syr2k_p func;
      func = (flash_syr2k_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
            ( fla_syr2k_t* ) t->cntl );
   }
   // FLA_Trmm
   else if ( t->func == (void *) FLA_Trmm_task )
   {
      flash_trmm_p func;
      func = (flash_trmm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trmm_t* ) t->cntl );
   }
   // FLA_Trsm
   else if ( t->func == (void *) FLA_Trsm_task )
   {
      flash_trsm_p func;
      func = (flash_trsm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trsm_t* ) t->cntl );
   }
   // FLA_Gemv
   else if ( t->func == (void *) FLA_Gemv_task )
   {
      flash_gemv_p func;
      func = (flash_gemv_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_gemv_t* ) t->cntl );
   }
   // FLA_Trsv
   else if ( t->func == (void *) FLA_Trsv_task )
   {
      flash_trsv_p func;
      func = (flash_trsv_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
            ( FLA_Diag    ) t->int_arg[2],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trsv_t* ) t->cntl );
   }
   // FLA_Axpy
   else if ( t->func == (void *) FLA_Axpy_task )
   {
      flash_axpy_p func;
      func = (flash_axpy_p) t->func;
         
      func(                 t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_axpy_t* ) t->cntl );
   }
   // FLA_Axpyt
   else if ( t->func == (void *) FLA_Axpyt_task )
   {
      flash_axpyt_p func;
      func = (flash_axpyt_p) t->func;
      
      func( ( FLA_Trans    ) t->int_arg[0],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_axpyt_t* ) t->cntl );
   }
   // FLA_Copy
   else if ( t->func == (void *) FLA_Copy_task )
   {
      flash_copy_p func;
      func = (flash_copy_p) t->func;
         
      func(                 t->input_arg[0],
                            t->output_arg[0],
            ( fla_copy_t* ) t->cntl );
   }
   // FLA_Copyt
   else if ( t->func == (void *) FLA_Copyt_task )
   {
      flash_copyt_p func;
      func = (flash_copyt_p) t->func;
      
      func( ( FLA_Trans    ) t->int_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_copyt_t* ) t->cntl );
   }
   // FLA_Copyr
   else if ( t->func == (void *) FLA_Copyr_task )
   {
      flash_copyr_p func;
      func = (flash_copyr_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_copyr_t* ) t->cntl );
   }
   // FLA_Scal
   else if ( t->func == (void *) FLA_Scal_task )
   {
      flash_scal_p func;
      func = (flash_scal_p) t->func;
      func(                 t->fla_arg[0],
                            t->output_arg[0],
            ( fla_scal_t* ) t->cntl );
   }
   // FLA_Scalr
   else if ( t->func == (void *) FLA_Scalr_task )
   {
      flash_scalr_p func;
      func = (flash_scalr_p) t->func;
      func( ( FLA_Uplo     ) t->int_arg[0],
                             t->fla_arg[0],
                             t->output_arg[0],
            ( fla_scalr_t* ) t->cntl );
   }
   // FLA_Obj_create_buffer
   else if ( t->func == (void *) FLA_Obj_create_buffer_task )
   {
      flash_obj_create_buffer_p func;
      func = (flash_obj_create_buffer_p) t->func;
      func( ( dim_t       ) t->int_arg[0],
            ( dim_t       ) t->int_arg[1],
                            t->output_arg[0],
                            t->cntl );
   }
   // FLA_Obj_free_buffer
   else if ( t->func == (void *) FLA_Obj_free_buffer_task )
   {
      flash_obj_free_buffer_p func;
      func = (flash_obj_free_buffer_p) t->func;
      func(                 t->output_arg[0],
                            t->cntl );
   }
   else
   {
      // No branch matched t->func: report the task type as unsupported.
      FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
   }
   
   return;
}
void FLASH_Queue_finalize( void )
References FLASH_Queue_finalize_gpu().
Referenced by FLA_Finalize().
{
   // Tear down only when currently initialized; otherwise this is a no-op.
   if ( flash_queue_initialized != FALSE )
   {
      // Mark the queue as no longer initialized.
      flash_queue_initialized = FALSE;

#ifdef FLA_ENABLE_GPU
      // GPU builds additionally run the GPU-specific finalization.
      FLASH_Queue_finalize_gpu();
#endif
   }
}
dim_t FLASH_Queue_get_block_size( void )
Referenced by FLASH_Queue_exec().
{
   // Read-only accessor: hands back the current value of
   // flash_queue_block_size, which is defined outside this function.
   return flash_queue_block_size;
}
dim_t FLASH_Queue_get_cache_line_size( void )
Referenced by FLASH_Queue_prefetch_block().
{
   // Read-only accessor: hands back the current value of
   // flash_queue_cache_line_size, which is defined outside this function.
   return flash_queue_cache_line_size;
}
dim_t FLASH_Queue_get_cache_size( void )
Referenced by FLASH_Queue_exec().
{
   // Read-only accessor: hands back the current value of
   // flash_queue_cache_size, which is defined outside this function.
   return flash_queue_cache_size;
}
FLA_Bool FLASH_Queue_get_caching( void )
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_wait_dequeue(), and FLASH_Task_update_dependencies().
{ 
   // Read-only accessor: hands back the current value of the
   // flash_queue_caching flag, which is defined outside this function.
   return flash_queue_caching;
}
int FLASH_Queue_get_cores_per_cache( void )
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
   // Read-only accessor: hands back the current value of
   // flash_queue_cores_per_cache, which is defined outside this function.
   return flash_queue_cores_per_cache;
}
int FLASH_Queue_get_cores_per_queue( void )
Referenced by FLASH_Queue_exec().
{
   // Read-only accessor: hands back the current value of
   // flash_queue_cores_per_queue, which is defined outside this function.
   return flash_queue_cores_per_queue;
}
Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
{ 
   // Read-only accessor: hands back the current value of
   // flash_queue_data_affinity, which is defined outside this function.
   // NOTE(review): the signature line for this body is missing from this
   // listing; presumably this is FLASH_Queue_get_data_affinity() - confirm
   // the name and return type against the original source.
   return flash_queue_data_affinity;
}
FLA_Bool FLASH_Queue_get_enabled( void )
Referenced by FLA_Apply_CAQ2_UT_internal(), FLA_Apply_pivots_internal(), FLA_Apply_Q2_UT_internal(), FLA_Apply_Q_UT_internal(), FLA_Apply_QUD_UT_internal(), FLA_Axpy_internal(), FLA_Axpyt_internal(), FLA_CAQR2_UT_internal(), FLA_Chol_internal(), FLA_Copy_internal(), FLA_Copyr_internal(), FLA_Copyt_internal(), FLA_Eig_gest_internal(), FLA_Gemm_internal(), FLA_Gemv_internal(), FLA_Hemm_internal(), FLA_Her2k_internal(), FLA_Herk_internal(), FLA_LQ_UT_internal(), FLA_LU_nopiv_internal(), FLA_LU_piv_internal(), FLA_Lyap_internal(), FLA_QR2_UT_internal(), FLA_QR_UT_copy_internal(), FLA_QR_UT_internal(), FLA_Scal_internal(), FLA_Scalr_internal(), FLA_Sylv_internal(), FLA_Symm_internal(), FLA_Syr2k_internal(), FLA_Syrk_internal(), FLA_Trinv_internal(), FLA_Trmm_internal(), FLA_Trsm_internal(), FLA_Trsv_internal(), FLA_Ttmm_internal(), FLA_UDdate_UT_internal(), FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_LU_incpiv_var1(), FLASH_LU_incpiv_var2(), FLASH_Queue_enable_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_SA_FS(), FLASH_SA_LU(), FLASH_Scal(), FLASH_Scalr(), FLASH_Trsm_piv(), and FLASH_Trsv().
{
#ifndef FLA_ENABLE_SUPERMATRIX
   // Without SuperMatrix configured, enqueuing is never reported as enabled.
   return FALSE;
#else
   // With SuperMatrix configured, report the current enabled flag.
   return flash_queue_enabled;
#endif
}
| FLASH_Task* FLASH_Queue_get_head_task | ( | void | ) | 
References FLASH_Queue_s::head.
Referenced by FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
| unsigned int FLASH_Queue_get_num_tasks | ( | void | ) | 
References FLASH_Queue_s::n_tasks.
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
| unsigned int FLASH_Queue_get_num_threads | ( | void | ) | 
Referenced by FLASH_Queue_check_gpu(), FLASH_Queue_exec(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_update_gpu(), FLASH_Queue_verbose_output(), FLASH_Task_free_parallel(), and FLASH_Task_update_dependencies().
{
   // Hand back the thread count recorded by FLASH_Queue_set_num_threads().
   const unsigned int n_threads = flash_queue_n_threads;

   return n_threads;
}
| double FLASH_Queue_get_parallel_time | ( | void | ) | 
{
   // While the begin/end stack is non-empty (still inside a parallel
   // region) no timing is reported.
   if ( flash_queue_stack != 0 )
      return 0.0;

   // Outside any parallel region: report the stored parallel time.
   return flash_queue_parallel_time;
}
| FLA_Bool FLASH_Queue_get_sorting | ( | void | ) | 
Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().
{
   // Hand back the current sorting flag.
   const FLA_Bool sorting = flash_queue_sorting;

   return sorting;
}
| FLASH_Task* FLASH_Queue_get_tail_task | ( | void | ) | 
| double FLASH_Queue_get_total_time | ( | void | ) | 
{
   // While the begin/end stack is non-empty (still inside a parallel
   // region) no timing is reported.
   if ( flash_queue_stack != 0 )
      return 0.0;

   // Outside any parallel region: report the stored total time.
   return flash_queue_total_time;
}
| FLASH_Verbose FLASH_Queue_get_verbose_output | ( | void | ) | 
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().
{
   // Hand back the verbose-output mode currently stored for the queue.
   const FLASH_Verbose verbose = flash_queue_verbose;

   return verbose;
}
| FLA_Bool FLASH_Queue_get_work_stealing | ( | void | ) | 
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().
{
   // Hand back the current work-stealing flag.
   const FLA_Bool work_stealing = flash_queue_work_stealing;

   return work_stealing;
}
| void FLASH_Queue_init | ( | void | ) | 
References FLASH_Queue_init_gpu(), and FLASH_Queue_reset().
Referenced by FLA_Init().
{
   // Initialization runs only once: when the flag is already TRUE the
   // whole body is skipped.
   if ( flash_queue_initialized != TRUE )
   {
      // Put every queue setting back to its initial value.
      FLASH_Queue_reset();

      // Mark the queue as initialized so later calls are no-ops.
      flash_queue_initialized = TRUE;

#ifdef FLA_ENABLE_GPU
      // GPU support is compiled in: initialize the GPU side as well.
      FLASH_Queue_init_gpu();
#endif
   }
}
| void FLASH_Queue_push | ( | void * | func, | 
| void * | cntl, | ||
| char * | name, | ||
| FLA_Bool | enabled_gpu, | ||
| int | n_int_args, | ||
| int | n_fla_args, | ||
| int | n_input_args, | ||
| int | n_output_args, | ||
| ... | |||
| ) | 
References FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Queue_push_output(), FLASH_Task_alloc(), FLASH_Queue_s::head, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::queue, FLASH_Queue_s::tail, and FLA_Obj_struct::write_task.
{
   // Creates a task for func/cntl, copies its variadic arguments into the
   // task, runs dependence analysis on every input and output operand, and
   // appends the task to the global task queue _tq.
   //
   // The variadic arguments are consumed in this order:
   //   1. n_int_args    values of type int,
   //   2. n_fla_args    values of type FLA_Obj,
   //   3. n_input_args  values of type FLA_Obj (input operands),
   //   4. n_output_args values of type FLA_Obj (output operands).
   int         i;
   va_list     var_arg_list;
   FLASH_Task* t;
   FLA_Obj     obj;
   // Allocate a new FLA_Task and populate its fields with appropriate values.
   t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
                         n_int_args, n_fla_args,
                         n_input_args, n_output_args );

   // Initialize variable argument environment. In case you're wondering, the
   // second argument in this macro invocation of va_start() is supposed to be
   // the parameter that immediately precedes the variable argument list
   // (ie: the ... above ).
   va_start( var_arg_list, n_output_args );
   // Extract the integer arguments.
   for ( i = 0; i < n_int_args; i++ )
      t->int_arg[i] = va_arg( var_arg_list, int );

   // Extract the FLA_Obj arguments.
   for ( i = 0; i < n_fla_args; i++ )
      t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );
   // Extract the input FLA_Obj arguments.
   for ( i = 0; i < n_input_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->input_arg[i] = obj;
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
         // Dependence analysis for each input block in macroblock.
         // Blocks are visited column by column: block (kk,jj) is at
         // offset jj * cs + kk from buf.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );
         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_input_arg.
         t->n_macro_args += m * n - 1;
      }
      else // Regular block.
      {
         // Dependence analysis for input operand.
         FLASH_Queue_push_input( obj, t );
      }
   }
   // Extract the output FLA_Obj arguments.
   for ( i = 0; i < n_output_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->output_arg[i] = obj;
      // Only assign data affinity to the first output block.
      // This must happen before FLASH_Queue_push_output() is called below,
      // because that call overwrites write_task and may advance
      // flash_queue_n_write_blocks.
      if ( i == 0 )
      {
         FLA_Obj buf = obj;
         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            buf = *FLASH_OBJ_PTR_AT( obj );

         // A block with no recorded writer takes the current write-block
         // counter as its queue; otherwise the task inherits the queue of
         // the task that last wrote the block.
         if ( buf.base->write_task == NULL )
            t->queue = flash_queue_n_write_blocks;
         else
            t->queue = buf.base->write_task->queue;
      }
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
         // Dependence analysis for each output block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );
         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_output_arg.
         t->n_macro_args += m * n - 1;
      }
      else // Regular block.
      {
         // Dependence analysis for output operand.
         FLASH_Queue_push_output( obj, t );
      }
   }
   // Finalize the variable argument environment.
   va_end( var_arg_list );

   // Add the task to the tail of the queue (and the head if queue is empty).
   if ( _tq.n_tasks == 0 )
   {
      // First task: order keeps the value 0 assigned by FLASH_Task_alloc().
      _tq.head = t;
      _tq.tail = t;
   }
   else
   {
      t->prev_task = _tq.tail;
      _tq.tail->next_task = t;
      _tq.tail            = t;
      // Determine the index of the task in the task queue.
      t->order = t->prev_task->order + 1;
   }

   // Increment the number of tasks.
   _tq.n_tasks++;
   return;
}
| void FLASH_Queue_push_input | ( | FLA_Obj | obj, | 
| FLASH_Task * | t | ||
| ) | 
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
   // Dependence bookkeeping for one block that task t reads:
   //   - if some task is recorded as the block's last writer, t is appended
   //     to that writer's dependence list;
   //   - otherwise t->n_ready is decremented and, on the first read of the
   //     block, the block receives a fresh read-block id;
   //   - finally t is appended to the block's reader list unless it is
   //     already the last entry there.
   FLASH_Task* writer;
   FLASH_Dep*  dep;
   int         already_last_reader;

   if ( obj.base->write_task != NULL )
   { // Flow dependence: the recorded writer must notify t.
      writer = obj.base->write_task;

      dep = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
      dep->task     = t;
      dep->next_dep = NULL;

      // Append the new node to the writer's dependence list.
      if ( writer->n_dep_args != 0 )
      {
         writer->dep_arg_tail->next_dep = dep;
         writer->dep_arg_tail           = dep;
      }
      else
      {
         writer->dep_arg_head = dep;
         writer->dep_arg_tail = dep;
      }

      writer->n_dep_args++;
   }
   else
   {
      // No recorded writer for this block.
      t->n_ready--;

      // Block neither written nor read so far: tag it with the next
      // read-block id (used for freeing) and advance the counter.
      if ( obj.base->n_read_tasks == 0 )
         obj.base->n_read_blocks = flash_queue_n_read_blocks++;
   }

   // Is t already the most recent entry in the block's reader list?
   already_last_reader = ( obj.base->n_read_tasks != 0 &&
                           obj.base->read_task_tail->task == t );

   if ( !already_last_reader )
   { // Record t as a reader; a later writer may turn this into an
     // anti-dependence.
      dep = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
      dep->task     = t;
      dep->next_dep = NULL;

      if ( obj.base->n_read_tasks != 0 )
      {
         obj.base->read_task_tail->next_dep = dep;
         obj.base->read_task_tail           = dep;
      }
      else
      {
         obj.base->read_task_head = dep;
         obj.base->read_task_tail = dep;
      }

      obj.base->n_read_tasks++;
   }
}
| void FLASH_Queue_push_output | ( | FLA_Obj | obj, | 
| FLASH_Task * | t | ||
| ) | 
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_free(), FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
   // Dependence bookkeeping for one block that task t writes:
   //   1. link t behind the block's previous writer (if any),
   //   2. turn every recorded reader of the block into a predecessor of t,
   //   3. clear the reader list and record t as the block's last writer.
   int         i;
   FLASH_Task* task;
   FLASH_Dep*  d;
   FLASH_Dep*  next_dep;
   // Assign tasks to threads with data affinity.
   if ( obj.base->write_task == NULL )
   {
      // No recorded writer for this block.
      t->n_ready--;

      // Save index in which this output block is first encountered.
      obj.base->n_write_blocks = flash_queue_n_write_blocks;

      // Number of blocks written if not written before.
      flash_queue_n_write_blocks++;

      // Add to number of blocks read if not written or read before.
      if ( obj.base->n_read_tasks == 0 )
      {
         // Identify each read block with an id for freeing.
         obj.base->n_read_blocks = flash_queue_n_read_blocks;

         flash_queue_n_read_blocks++;
      }
   }
   else
   { // Flow dependence potentially.
      // The last task to overwrite this block is not itself.
      if ( obj.base->write_task != t )
      {
         // Create dependency from task that last wrote the block.
         task = obj.base->write_task;

         d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

         d->task     = t;
         d->next_dep = NULL;

         // Append the new node to the previous writer's dependence list.
         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }

         task->n_dep_args++;
      }
      else
      {
         // No need to notify task twice for output block already seen.
         t->n_ready--;
      }
   }

   // Clear read task for next set of reads and record the anti-dependence.
   d = obj.base->read_task_head;

   for ( i = 0; i < obj.base->n_read_tasks; i++ )
   {
      // Save the reader and the successor now: the node is either re-linked
      // or freed below, after which its fields no longer describe this list.
      task     = d->task;
      next_dep = d->next_dep;

      // If the last task to read is not the current task, add dependence.
      if ( task != t )
      {
         // Reuse the reader-list node itself as the dependence node: it is
         // re-pointed at t and moved onto the reader's dependence list, so
         // no new allocation is made here.
         d->task     = t;
         d->next_dep = NULL;

         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }

         task->n_dep_args++;

         // Count one more write-after-read argument for t.
         t->n_war_args++;
      }
      else
      {
         // t was itself the reader; the node is not needed any more.
         FLA_free( d );
      }

      d = next_dep;
   }

   // Every node has been re-linked or freed above, so the block's reader
   // list is reset to empty.
   obj.base->n_read_tasks   = 0;
   obj.base->read_task_head = NULL;
   obj.base->read_task_tail = NULL;

   // Record this task as the last to write to this block.
   obj.base->write_task = t;

   return;
}
| void FLASH_Queue_reset | ( | void | ) | 
References FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_init().
| void FLASH_Queue_set_block_size | ( | dim_t | size | ) | 
Referenced by FLASH_Obj_create_hierarchy().
{
   // The stored block size only ever grows: a request that is not larger
   // than the current value is ignored.
   if ( size > flash_queue_block_size )
   {
      flash_queue_block_size = size;
   }
}
| void FLASH_Queue_set_cache_line_size | ( | dim_t | size | ) | 
{
   // Store the given cache line size for the queue.
   flash_queue_cache_line_size = size;
}
| void FLASH_Queue_set_cache_size | ( | dim_t | size | ) | 
{
   // Store the given cache size for the queue.
   flash_queue_cache_size = size;
}
| void FLASH_Queue_set_caching | ( | FLA_Bool | caching | ) | 
Referenced by FLASH_Queue_exec().
{
   // Store the given caching flag for the queue.
   flash_queue_caching = caching;
}
| void FLASH_Queue_set_cores_per_cache | ( | int | cores | ) | 
{
   // Store the given cores-per-cache value for the queue.
   flash_queue_cores_per_cache = cores;
}
| void FLASH_Queue_set_cores_per_queue | ( | int | cores | ) | 
{
   // Store the given cores-per-queue value for the queue.
   flash_queue_cores_per_queue = cores;
}
| void FLASH_Queue_set_data_affinity | ( | FLASH_Data_aff | data_affinity | ) | 
Referenced by FLASH_Queue_exec().
{
   // Store the given data affinity mode for the queue.
   flash_queue_data_affinity = data_affinity;
}
| void FLASH_Queue_set_num_threads | ( | unsigned int | n_threads | ) | 
References FLA_Check_num_threads().
{
   // Check the requested thread count; the resulting status is handed to
   // FLA_Check_error_code().
   FLA_Error status = FLA_Check_num_threads( n_threads );
   FLA_Check_error_code( status );

   // Remember the thread count for later queries.
   flash_queue_n_threads = n_threads;

#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
   // Nothing further to do for OpenMP: the thread count is applied at the
   // parallel for loop through a num_threads() clause. The global number of
   // OpenMP threads therefore stays under the user's control, via the
   // OMP_NUM_THREADS environment variable or omp_set_num_threads(),
   // independently of the number of SuperMatrix threads.

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
   // Nothing further to do for pthreads: the thread count is handled
   // entirely on our end.
#endif
}
| void FLASH_Queue_set_parallel_time | ( | double | dtime | ) | 
Referenced by FLASH_Queue_exec().
{
   // Store the given parallel execution time.
   flash_queue_parallel_time = dtime;
}
| void FLASH_Queue_set_sorting | ( | FLA_Bool | sorting | ) | 
{
   // Store the given sorting flag for the queue.
   flash_queue_sorting = sorting;
}
| void FLASH_Queue_set_verbose_output | ( | FLASH_Verbose | verbose | ) | 
{
   // Store the given verbose-output mode for the queue.
   flash_queue_verbose = verbose;
}
| void FLASH_Queue_set_work_stealing | ( | FLA_Bool | work_stealing | ) | 
Referenced by FLASH_Queue_exec().
{
   // Store the given work-stealing flag for the queue.
   flash_queue_work_stealing = work_stealing;
}
| unsigned int FLASH_Queue_stack_depth | ( | void | ) | 
Referenced by FLASH_Eig_gest(), FLASH_LU_incpiv(), FLASH_QR_UT_inc(), FLASH_Queue_disable_gpu(), and FLASH_Queue_enable_gpu().
{
   // Hand back the current value of the begin/end stack counter.
   const unsigned int depth = flash_queue_stack;

   return depth;
}
| void FLASH_Queue_verbose_output | ( | void | ) | 
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec().
{
   // Prints the queued tasks to stdout. In readable mode each task is
   // listed on one line with its operands; in any other mode a Graphviz
   // "digraph" of the task dependences is emitted, with tasks grouped into
   // one cluster per thread when a data affinity mode is active.
   int           task_idx, arg_idx, thread_idx;
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   cur;
   FLASH_Dep*    dep;

   // Start at the head of the task queue.
   cur = FLASH_Queue_get_head_task();

   if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
   {
      // One line per task: order, name, outputs, ":=", outputs, inputs.
      for ( task_idx = 0; task_idx < n_tasks; task_idx++ )
      {
         printf( "%d\t%s\t", cur->order, cur->name );

         for ( arg_idx = 0; arg_idx < cur->n_output_args; arg_idx++ )
         {
            printf( "%lu[%lu,%lu] ", cur->output_arg[arg_idx].base->id,
                    cur->output_arg[arg_idx].base->m_index,
                    cur->output_arg[arg_idx].base->n_index );
         }

         printf( ":= " );

         // The outputs are listed again on the right-hand side, followed
         // by the inputs.
         for ( arg_idx = 0; arg_idx < cur->n_output_args; arg_idx++ )
         {
            printf( "%lu[%lu,%lu] ", cur->output_arg[arg_idx].base->id,
                    cur->output_arg[arg_idx].base->m_index,
                    cur->output_arg[arg_idx].base->n_index );
         }

         for ( arg_idx = 0; arg_idx < cur->n_input_args; arg_idx++ )
         {
            printf( "%lu[%lu,%lu] ", cur->input_arg[arg_idx].base->id,
                    cur->input_arg[arg_idx].base->m_index,
                    cur->input_arg[arg_idx].base->n_index );
         }
         printf( "\n" );

         // Advance to the next task.
         cur = cur->next_task;
      }
      printf( "\n" );

      return;
   }

   // Graph output.
   printf( "digraph SuperMatrix {\n" );

   if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
   {
      // One cluster per thread, holding the tasks whose queue index
      // matches that thread.
      for ( thread_idx = 0; thread_idx < n_threads; thread_idx++ )
      {
         printf( "subgraph cluster%d {\nlabel=\"%d\"\n", thread_idx, thread_idx );

         for ( task_idx = 0; task_idx < n_tasks; task_idx++ )
         {
            if ( cur->queue == thread_idx )
               printf( "%d [label=\"%s\"];\n", cur->order, cur->name );

            // Advance to the next task.
            cur = cur->next_task;
         }
         printf( "}\n" );

         // Rewind to the head of the task queue for the next pass.
         cur = FLASH_Queue_get_head_task();
      }

      // Edges: each task points at the tasks on its dependence list.
      for ( task_idx = 0; task_idx < n_tasks; task_idx++ )
      {
         printf( "%d -> {", cur->order );

         dep = cur->dep_arg_head;
         for ( arg_idx = 0; arg_idx < cur->n_dep_args; arg_idx++ )
         {
            printf( "%d;", dep->task->order );
            dep = dep->next_dep;
         }
         printf( "};\n" );

         // Advance to the next task.
         cur = cur->next_task;
      }
   }
   else
   {
      // No affinity: emit the node label and its edges together.
      for ( task_idx = 0; task_idx < n_tasks; task_idx++ )
      {
         printf( "%d [label=\"%s\"]; %d -> {", cur->order, cur->name, cur->order);

         dep = cur->dep_arg_head;
         for ( arg_idx = 0; arg_idx < cur->n_dep_args; arg_idx++ )
         {
            printf( "%d;", dep->task->order );
            dep = dep->next_dep;
         }
         printf( "};\n" );

         // Advance to the next task.
         cur = cur->next_task;
      }
   }

   printf( "}\n\n" );
}
| FLASH_Task* FLASH_Task_alloc | ( | void * | func, | 
| void * | cntl, | ||
| char * | name, | ||
| FLA_Bool | enabled_gpu, | ||
| int | n_int_args, | ||
| int | n_fla_args, | ||
| int | n_input_args, | ||
| int | n_output_args | ||
| ) | 
References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_push().
{
   // Creates a task record, allocates one argument array per argument
   // category sized from the given counts, stores the caller-supplied
   // description, and sets the bookkeeping fields below to their starting
   // values. The argument arrays themselves are left unfilled here.
   FLASH_Task* task = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );

   // Storage for the integer, FLA_Obj, input and output arguments.
   task->int_arg    = (int *)     FLA_malloc( n_int_args    * sizeof(int) );
   task->fla_arg    = (FLA_Obj *) FLA_malloc( n_fla_args    * sizeof(FLA_Obj) );
   task->input_arg  = (FLA_Obj *) FLA_malloc( n_input_args  * sizeof(FLA_Obj) );
   task->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );

   // What the task is, as supplied by the caller.
   task->func        = func;
   task->cntl        = cntl;
   task->name        = name;
   task->enabled_gpu = enabled_gpu;

   // Argument counts supplied by the caller.
   task->n_int_args    = n_int_args;
   task->n_fla_args    = n_fla_args;
   task->n_input_args  = n_input_args;
   task->n_output_args = n_output_args;

   // Counters that start at zero.
   task->n_ready      = 0;
   task->n_macro_args = 0;
   task->n_war_args   = 0;
   task->n_dep_args   = 0;

   // Ordering and placement fields, all starting at zero / FALSE.
   task->order  = 0;
   task->queue  = 0;
   task->height = 0;
   task->thread = 0;
   task->cache  = 0;
   task->hit    = FALSE;

   // The task starts with an empty dependence list and is not linked
   // into any task or wait list.
   task->dep_arg_head = NULL;
   task->dep_arg_tail = NULL;
   task->prev_task    = NULL;
   task->next_task    = NULL;
   task->prev_wait    = NULL;
   task->next_wait    = NULL;

   return task;
}
| void FLASH_Task_free | ( | FLASH_Task * | t | ) | 
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().
{
   // Releases task t: clears the last-writer record of every output block,
   // empties and frees the reader list of every input block, frees t's own
   // dependence list, and finally frees the argument arrays and t itself.
   int        arg, r, n_reads;
   FLA_Obj    operand;
   FLASH_Dep* dep;
   FLASH_Dep* following;

   // Outputs: forget which task last wrote each block.
   for ( arg = 0; arg < t->n_output_args; arg++ )
   {
      operand = t->output_arg[arg];

      if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
      {
         // Regular block.
         operand.base->write_task = NULL;
      }
      else
      {
         // Macroblock: visit every contained block, column by column.
         dim_t    col, row;
         dim_t    m      = FLA_Obj_length( operand );
         dim_t    n      = FLA_Obj_width( operand );
         dim_t    cs     = FLA_Obj_col_stride( operand );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );

         for ( col = 0; col < n; col++ )
         {
            for ( row = 0; row < m; row++ )
            {
               blocks[ col * cs + row ].base->write_task = NULL;
            }
         }
      }
   }

   // Inputs: detach and free each block's reader list.
   for ( arg = 0; arg < t->n_input_args; arg++ )
   {
      operand = t->input_arg[arg];

      if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
      {
         // Regular block: remember the list, reset the block's fields,
         // then free the remembered nodes.
         n_reads = operand.base->n_read_tasks;
         dep     = operand.base->read_task_head;

         operand.base->n_read_tasks   = 0;
         operand.base->read_task_head = NULL;
         operand.base->read_task_tail = NULL;

         for ( r = 0; r < n_reads; r++ )
         {
            following = dep->next_dep;
            FLA_free( dep );
            dep = following;
         }
      }
      else
      {
         // Macroblock: same treatment for every contained block.
         dim_t    col, row;
         dim_t    m      = FLA_Obj_length( operand );
         dim_t    n      = FLA_Obj_width( operand );
         dim_t    cs     = FLA_Obj_col_stride( operand );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );
         FLA_Obj  block;

         for ( col = 0; col < n; col++ )
         {
            for ( row = 0; row < m; row++ )
            {
               block = blocks[ col * cs + row ];

               n_reads = block.base->n_read_tasks;
               dep     = block.base->read_task_head;

               block.base->n_read_tasks   = 0;
               block.base->read_task_head = NULL;
               block.base->read_task_tail = NULL;

               for ( r = 0; r < n_reads; r++ )
               {
                  following = dep->next_dep;
                  FLA_free( dep );
                  dep = following;
               }
            }
         }
      }
   }

   // Free every node on t's own dependence list.
   dep = t->dep_arg_head;
   for ( arg = 0; arg < t->n_dep_args; arg++ )
   {
      following = dep->next_dep;
      FLA_free( dep );
      dep = following;
   }

   // Free the four argument arrays, then the task structure itself.
   FLA_free( t->int_arg );
   FLA_free( t->fla_arg );
   FLA_free( t->input_arg );
   FLA_free( t->output_arg );
   FLA_free( t );
}
 1.7.6.1