libflame  revision_anchor
Functions
FLASH_Queue_main_prototypes.h File Reference

(r)

Go to the source code of this file.

Functions

void FLASH_Queue_begin (void)
 
void FLASH_Queue_end (void)
 
unsigned int FLASH_Queue_stack_depth (void)
 
FLA_Error FLASH_Queue_enable (void)
 
FLA_Error FLASH_Queue_disable (void)
 
FLA_Bool FLASH_Queue_get_enabled (void)
 
void FLASH_Queue_set_num_threads (unsigned int n_threads)
 
unsigned int FLASH_Queue_get_num_threads (void)
 
void FLASH_Queue_init (void)
 
void FLASH_Queue_finalize (void)
 
unsigned int FLASH_Queue_get_num_tasks (void)
 
void FLASH_Queue_set_verbose_output (FLASH_Verbose verbose)
 
FLASH_Verbose FLASH_Queue_get_verbose_output (void)
 
void FLASH_Queue_set_sorting (FLA_Bool sorting)
 
FLA_Bool FLASH_Queue_get_sorting (void)
 
void FLASH_Queue_set_caching (FLA_Bool caching)
 
FLA_Bool FLASH_Queue_get_caching (void)
 
void FLASH_Queue_set_work_stealing (FLA_Bool work_stealing)
 
FLA_Bool FLASH_Queue_get_work_stealing (void)
 
void FLASH_Queue_set_data_affinity (FLASH_Data_aff data_affinity)
 
FLASH_Data_aff FLASH_Queue_get_data_affinity (void)
 
double FLASH_Queue_get_total_time (void)
 
double FLASH_Queue_get_parallel_time (void)
 
void FLASH_Queue_exec (void)
 
void FLASH_Queue_set_parallel_time (double dtime)
 
void FLASH_Queue_set_block_size (dim_t size)
 
dim_t FLASH_Queue_get_block_size (void)
 
void FLASH_Queue_set_cache_size (dim_t size)
 
dim_t FLASH_Queue_get_cache_size (void)
 
void FLASH_Queue_set_cache_line_size (dim_t size)
 
dim_t FLASH_Queue_get_cache_line_size (void)
 
void FLASH_Queue_set_cores_per_cache (int cores)
 
int FLASH_Queue_get_cores_per_cache (void)
 
void FLASH_Queue_set_cores_per_queue (int cores)
 
int FLASH_Queue_get_cores_per_queue (void)
 
void FLASH_Queue_reset (void)
 
FLASH_Task * FLASH_Queue_get_head_task (void)
 
FLASH_Task * FLASH_Queue_get_tail_task (void)
 
void FLASH_Queue_push (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args,...)
 
void FLASH_Queue_push_input (FLA_Obj obj, FLASH_Task *t)
 
void FLASH_Queue_push_output (FLA_Obj obj, FLASH_Task *t)
 
FLASH_Task * FLASH_Task_alloc (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args)
 
void FLASH_Task_free (FLASH_Task *t)
 
void FLASH_Queue_exec_task (FLASH_Task *t)
 
void FLASH_Queue_verbose_output (void)
 
void FLASH_Queue_init_tasks (void *arg)
 
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
 
FLASH_Task * FLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
 
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
 
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
 
void FLASH_Queue_prefetch (int cache, void *arg)
 
void FLASH_Queue_prefetch_block (FLA_Obj obj)
 
FLASH_Task * FLASH_Queue_work_stealing (int queue, void *arg)
 
void FLASH_Queue_create_gpu (int thread, void *arg)
 
void FLASH_Queue_destroy_gpu (int thread, void *arg)
 
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
 
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
 
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
 
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_gpu (int thread, void *arg)
 
void FLASH_Queue_exec_parallel (void *arg)
 
void * FLASH_Queue_exec_parallel_function (void *arg)
 
FLASH_Task * FLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
 
FLASH_Task * FLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
 
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
 
void FLASH_Queue_exec_simulation (void *arg)
 

Function Documentation

◆ FLASH_Queue_begin()

void FLASH_Queue_begin ( void  )

Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().

65 {
66 #ifdef FLA_ENABLE_SUPERMATRIX
67  if ( flash_queue_stack == 0 )
68  {
69  // Save the starting time for the total execution time.
70  flash_queue_total_time = FLA_Clock();
71  }
72 #endif
73 
74  // Push onto the stack.
75  flash_queue_stack++;
76 
77  return;
78 }
double FLA_Clock(void)
Definition: FLA_Clock.c:20

◆ FLASH_Queue_check_block_gpu()

FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

1552 {
1553  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1554  int k;
1555  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1556  FLA_Bool r_val = TRUE;
1557 
1558 #ifdef FLA_ENABLE_MULTITHREADING
1559  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560 #endif
1561 
1562  // Locate the position of the block on the GPU.
1563  for ( k = 0; k < gpu_n_blocks; k++ )
1564  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565  break;
1566 
1567  if ( k < gpu_n_blocks )
1568  {
1569  // Request this block if it is dirty.
1570  if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571  {
1572  args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573 
1574  r_val = FALSE;
1575  }
1576  }
1577 
1578  // Check the victim block.
1579  if ( obj.base == args->victim[thread].obj.base )
1580  r_val = FALSE;
1581 
1582 #ifdef FLA_ENABLE_MULTITHREADING
1583  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584 #endif
1585 
1586  return r_val;
1587 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Obj_gpu * victim
Definition: FLASH_Queue_exec.c:107
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_check_gpu()

FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

1453 {
1454  int i, j, k;
1455  int thread = t->thread;
1456  int n_input_args = t->n_input_args;
1457  int n_output_args = t->n_output_args;
1458  int n_threads = FLASH_Queue_get_num_threads();
1459  FLA_Bool r_val = TRUE;
1460  FLA_Bool t_val;
1461  FLA_Bool duplicate;
1462  FLA_Obj obj;
1463 
1464  // Check the input and output arguments on the GPUs.
1465  for ( i = 0; i < n_input_args + n_output_args; i++ )
1466  {
1467  // Check for duplicate blocks.
1468  duplicate = FALSE;
1469 
1470  // Find the correct input or output argument.
1471  if ( i < n_input_args )
1472  {
1473  obj = t->input_arg[i];
1474 
1475  for ( j = 0; j < n_output_args && !duplicate; j++ )
1476  {
1477  if ( obj.base == t->output_arg[j].base )
1478  duplicate = TRUE;
1479  }
1480 
1481  for ( j = 0; j < i && !duplicate; j++ )
1482  {
1483  if ( obj.base == t->input_arg[j].base )
1484  duplicate = TRUE;
1485  }
1486  }
1487  else
1488  {
1489  obj = t->output_arg[i - n_input_args];
1490 
1491  for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1492  {
1493  if ( obj.base == t->output_arg[j].base )
1494  duplicate = TRUE;
1495  }
1496  }
1497 
1498  // If the block has not been processed before.
1499  if ( !duplicate )
1500  {
1501  // Macroblock is used.
1502  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1503  {
1504  dim_t jj, kk;
1505  dim_t m = FLA_Obj_length( obj );
1506  dim_t n = FLA_Obj_width( obj );
1507  dim_t cs = FLA_Obj_col_stride( obj );
1508  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1509 
1510  // Clear each block in macroblock.
1511  for ( jj = 0; jj < n; jj++ )
1512  {
1513  for ( kk = 0; kk < m; kk++ )
1514  {
1515  obj = *( buf + jj * cs + kk );
1516 
1517  t_val = TRUE;
1518 
1519  // Check to see if the block is dirty on another GPU.
1520  for ( k = 0; k < n_threads && t_val; k++ )
1521  if ( k != thread )
1522  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
1523 
1524  r_val = r_val && t_val;
1525  }
1526  }
1527  }
1528  else
1529  {
1530  t_val = TRUE;
1531 
1532  // Check to see if the block is dirty on another GPU.
1533  for ( k = 0; k < n_threads && t_val; k++ )
1534  if ( k != thread )
1535  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
1536 
1537  r_val = r_val && t_val;
1538  }
1539  }
1540  }
1541 
1542  return r_val;
1543 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
int n_input_args
Definition: FLA_type_defs.h:217
FLA_Bool FLASH_Queue_check_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1546
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51

◆ FLASH_Queue_create_gpu()

void FLASH_Queue_create_gpu ( int  thread,
void *  arg 
)

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

1233 {
1234  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1235  int i;
1236  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1237  dim_t block_size = args->block_size;
1238  FLA_Datatype datatype = args->datatype;
1239 
1240  // Exit if not using GPU.
1241  if ( !FLASH_Queue_get_enabled_gpu() )
1242  return;
1243 
1244  // Bind thread to GPU.
1245  FLASH_Queue_bind_gpu( thread );
1246 
1247  // Allocate the memory on the GPU for all the blocks a priori.
1248  for ( i = 0; i < gpu_n_blocks; i++ )
1249  FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250 
1251  return;
1252 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Datatype datatype
Definition: FLASH_Queue_exec.c:116
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Error FLASH_Queue_bind_gpu(int thread)
Definition: FLASH_Queue_gpu.c:133
Definition: FLASH_Queue_exec.c:54
dim_t block_size
Definition: FLASH_Queue_exec.c:113
int FLA_Datatype
Definition: FLA_type_defs.h:49
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Error FLASH_Queue_alloc_gpu(dim_t size, FLA_Datatype datatype, void **buffer_gpu)
Definition: FLASH_Queue_gpu.c:147
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_destroy_gpu()

void FLASH_Queue_destroy_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, i, and FLA_Obj_gpu_struct::obj.

Referenced by FLASH_Queue_exec_parallel_function().

1261 {
1262  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1263  int i;
1264  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1265  FLA_Obj_gpu gpu_obj;
1266 
1267  // Exit if not using GPU.
1268  if ( !FLASH_Queue_get_enabled_gpu() )
1269  return;
1270 
1271  // Examine every block left on the GPU.
1272  for ( i = 0; i < gpu_n_blocks; i++ )
1273  {
1274  gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275 
1276  // Flush the blocks that are dirty.
1277  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279 
1280  // Free the memory on the GPU for all the blocks.
1281  FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282  }
1283 
1284  return;
1285 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
FLA_Error FLASH_Queue_free_gpu(void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:171
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
int i
Definition: bl1_axmyv2.c:145
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_disable()

FLA_Error FLASH_Queue_disable ( void  )

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

150 {
151 #ifdef FLA_ENABLE_SUPERMATRIX
152  if ( flash_queue_stack == 0 )
153  {
154  // Disable if not begin parallel region yet.
155  flash_queue_enabled = FALSE;
156  return FLA_SUCCESS;
157  }
158  else
159  {
160  // Cannot change status during parallel region.
161  return FLA_FAILURE;
162  }
163 #else
164  // Allow disabling enqueuing even when SuperMatrix is not configured.
165  flash_queue_enabled = FALSE;
166  return FLA_SUCCESS;
167 #endif
168 }

◆ FLASH_Queue_enable()

FLA_Error FLASH_Queue_enable ( void  )

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

123 {
124 #ifdef FLA_ENABLE_SUPERMATRIX
125  if ( flash_queue_stack == 0 )
126  {
127  // Enable if not begin parallel region yet.
128  flash_queue_enabled = TRUE;
129  return FLA_SUCCESS;
130  }
131  else
132  {
133  // Cannot change status during parallel region.
134  return FLA_FAILURE;
135  }
136 #else
137  // Raise an exception when SuperMatrix is not configured.
138  FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );
139  return FLA_FAILURE;
140 #endif
141 }

◆ FLASH_Queue_end()

void FLASH_Queue_end ( void  )

Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().

87 {
88  // Pop off the stack.
89  flash_queue_stack--;
90 
91 #ifdef FLA_ENABLE_SUPERMATRIX
92  if ( flash_queue_stack == 0 )
93  {
94  // Execute tasks if encounter the outermost parallel region.
95  FLASH_Queue_exec();
96 
97  // Find the total execution time.
98  flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
99  }
100 #endif
101 
102  return;
103 }
void FLASH_Queue_exec(void)
Definition: FLASH_Queue_exec.c:2756
double FLA_Clock(void)
Definition: FLA_Clock.c:20

◆ FLASH_Queue_exec()

void FLASH_Queue_exec ( void  )

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), for(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, i, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

2762 {
2763  int n_tasks = FLASH_Queue_get_num_tasks();
2764  int i;
2765  double dtime;
2766 
2767  // All the necessary variables for the SuperMatrix mechanism.
2768  FLASH_Queue_vars args;
2769 
2770  // If the queue is empty, return early.
2771  if ( n_tasks == 0 )
2772  return;
2773 
2774  // Turn off all multiple queue implementations.
2775  FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
2776  FLASH_Queue_set_work_stealing( FALSE );
2777  // Do not use cache affinity yet.
2778  FLASH_Queue_set_caching( FALSE );
2779 
2780  // Allocate memory for task queues.
2781  args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
2782  args.n_ready = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
2783  args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
2784  args.n_wait = ( int* ) FLA_shmalloc( sizeof( int ) );
2785  args.pc = ( int* ) FLA_shmalloc( sizeof( int ) );
2786 
2787  // Initialize data.
2788  if ( FLA_is_owner() )
2789  {
2790  args.n_wait[0] = 0;
2791  args.pc[0] = 0;
2792  }
2793 
2794  Synch_all();
2795 
2796  // Initialize tasks with critical information.
2797  FLASH_Queue_init_tasks( ( void* ) &args );
2798 
2799  // Display verbose output before free all tasks.
2800  if ( FLASH_Queue_get_verbose_output() )
2801  FLASH_Queue_verbose_output();
2802 
2803  // Start timing the parallel execution.
2804  dtime = RCCE_wtime();
2805 
2806  FLASH_Queue_exec_parallel_function( ( void* ) &args );
2807 
2808  // End timing the parallel execution.
2809  dtime = RCCE_wtime() - dtime;
2810  FLASH_Queue_set_parallel_time( dtime );
2811 
2812  // Free all tasks sequentially.
2813  for ( i = 0; i < n_tasks; i++ )
2814  FLASH_Task_free( args.task_queue[i] );
2815 
2816  // Free data.
2817  FLA_free( args.task_queue );
2818  FLA_shfree( args.n_ready );
2819  FLA_shfree( args.wait_queue );
2820  FLA_shfree( args.n_wait );
2821  FLA_shfree( args.pc );
2822 
2823  // Reset values for next call to FLASH_Queue_exec().
2824  FLASH_Queue_reset();
2825 
2826  return;
2827 }
void Synch_all()
int pc
Definition: FLASH_Queue_exec.c:96
int * n_ready
Definition: FLASH_Queue_exec.c:2742
void * FLA_shmalloc(size_t size)
Definition: FLA_Obj.c:21
FLASH_Task ** task_queue
Definition: FLASH_Queue_exec.c:2739
void FLASH_Queue_verbose_output(void)
Definition: FLASH_Queue.c:1782
void FLASH_Queue_reset(void)
Definition: FLASH_Queue.c:583
Definition: FLA_type_defs.h:183
void FLA_shfree(void *ptr)
Definition: FLA_Obj.c:27
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
void FLASH_Queue_init_tasks(void *arg)
Definition: FLASH_Queue_exec.c:394
FLA_Bool FLA_is_owner(void)
Definition: FLA_Obj.c:33
void FLASH_Queue_set_data_affinity(FLASH_Data_aff data_affinity)
Definition: FLASH_Queue.c:391
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
void FLASH_Queue_set_work_stealing(FLA_Bool work_stealing)
Definition: FLASH_Queue.c:367
void * FLASH_Queue_exec_parallel_function(void *arg)
Definition: FLASH_Queue_exec.c:2156
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
void FLASH_Queue_set_parallel_time(double dtime)
Definition: FLASH_Queue.c:448
void FLASH_Task_free(FLASH_Task *t)
Definition: FLASH_Queue.c:1020
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition: FLASH_Queue.c:308
int * n_wait
Definition: FLASH_Queue_exec.c:2748
void FLASH_Queue_set_caching(FLA_Bool caching)
Definition: FLASH_Queue.c:343
double RCCE_wtime(void)

◆ FLASH_Queue_exec_gpu()

FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_parallel_function().

1294 {
1295  void** input_arg;
1296  void** output_arg;
1297 
1298  if ( t == NULL )
1299  return TRUE;
1300 
1301  // If not using the GPU, then execute on CPU.
1302  if ( !FLASH_Queue_get_enabled_gpu() )
1303  {
1304  FLASH_Queue_exec_task( t );
1305 
1306  return TRUE;
1307  }
1308 
1309  // Check if all the operands are ready and up to date.
1310  if ( !FLASH_Queue_check_gpu( t, arg ) )
1311  {
1312  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1313  int queue = t->queue;
1314  t->hit = FALSE;
1315 
1316 #ifdef FLA_ENABLE_MULTITHREADING
1317  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
1318 #endif
1319  // Reenqueue the task if the blocks are not all flushed.
1320  FLASH_Queue_wait_enqueue( t, arg );
1321 
1322 #ifdef FLA_ENABLE_MULTITHREADING
1323  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
1324 #endif
1325 
1326  return FALSE;
1327  }
1328 
1329  // If GPU is enabled, but the task is not supported for GPU execution.
1330  if ( !t->enabled_gpu )
1331  {
1332  int i, j, k;
1333  int thread = t->thread;
1334  int n_input_args = t->n_input_args;
1335  int n_output_args = t->n_output_args;
1336  int n_threads = FLASH_Queue_get_num_threads();
1337  FLA_Bool duplicate;
1338  FLA_Obj obj;
1339 
1340  // Check the blocks on each GPU.
1341  for ( k = 0; k < n_threads; k++ )
1342  {
1343  // Check the input and output arguments on the GPUs.
1344  for ( i = 0; i < n_input_args + n_output_args; i++ )
1345  {
1346  // Check for duplicate blocks.
1347  duplicate = FALSE;
1348 
1349  // Find the correct input or output argument.
1350  if ( i < n_input_args )
1351  {
1352  obj = t->input_arg[i];
1353 
1354  for ( j = 0; j < n_output_args && !duplicate; j++ )
1355  {
1356  if ( obj.base == t->output_arg[j].base )
1357  duplicate = TRUE;
1358  }
1359 
1360  for ( j = 0; j < i && !duplicate; j++ )
1361  {
1362  if ( obj.base == t->input_arg[j].base )
1363  duplicate = TRUE;
1364  }
1365  }
1366  else
1367  {
1368  obj = t->output_arg[i - n_input_args];
1369 
1370  for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1371  {
1372  if ( obj.base == t->output_arg[j].base )
1373  duplicate = TRUE;
1374  }
1375  }
1376 
1377  // If the block has not been processed before.
1378  if ( !duplicate )
1379  {
1380  // Macroblock is used.
1381  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1382  {
1383  dim_t jj, kk;
1384  dim_t m = FLA_Obj_length( obj );
1385  dim_t n = FLA_Obj_width( obj );
1386  dim_t cs = FLA_Obj_col_stride( obj );
1387  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1388 
1389  // Clear each block in macroblock.
1390  for ( jj = 0; jj < n; jj++ )
1391  {
1392  for ( kk = 0; kk < m; kk++ )
1393  {
1394  obj = *( buf + jj * cs + kk );
1395 
1396  // Flush the block to main memory if it is on the GPU.
1397  if ( k == thread )
1398  FLASH_Queue_flush_block_gpu( obj, k, arg );
1399 
1400  // Invalidate output block on all GPUs.
1401  if ( i >= n_input_args )
1402  FLASH_Queue_invalidate_block_gpu( obj, k, arg );
1403  }
1404  }
1405  }
1406  else
1407  {
1408  // Flush the block to main memory if it is on the GPU.
1409  if ( k == thread )
1410  FLASH_Queue_flush_block_gpu( obj, k, arg );
1411 
1412  // Invalidate output block on all GPUs.
1413  if ( i >= n_input_args )
1414  FLASH_Queue_invalidate_block_gpu( obj, k, arg );
1415  }
1416  }
1417  }
1418  }
1419 
1420  // Execute the task on CPU instead of GPU.
1421  FLASH_Queue_exec_task( t );
1422 
1423  return TRUE;
1424  }
1425 
1426  // Gather the pointers for the data on the GPU.
1427  input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
1428  output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );
1429 
1430  // Bring all the blocks to GPU.
1431  FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );
1432 
1433  // Execute the task on GPU.
1434  FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );
1435 
1436  // Mark all the output blocks as dirty.
1437  FLASH_Queue_mark_gpu( t, arg );
1438 
1439  // Free memory.
1440  FLA_free( input_arg );
1441  FLA_free( output_arg );
1442 
1443  return TRUE;
1444 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1844
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
FLA_Bool enabled_gpu
Definition: FLA_type_defs.h:206
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
int n_input_args
Definition: FLA_type_defs.h:217
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
int queue
Definition: FLA_type_defs.h:190
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
void FLASH_Queue_exec_task_gpu(FLASH_Task *t, void **input_arg, void **output_arg)
Definition: FLASH_Queue_gpu.c:225
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
void FLASH_Queue_flush_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1893
int thread
Definition: FLA_type_defs.h:192
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
void FLASH_Queue_update_gpu(FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
Definition: FLASH_Queue_exec.c:1590
FLA_Bool FLASH_Queue_check_gpu(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:1447
void FLASH_Queue_mark_gpu(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:1787

◆ FLASH_Queue_exec_parallel()

void FLASH_Queue_exec_parallel ( void *  arg)

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_get_num_threads(), and i.

Referenced by FLASH_Queue_exec().

2049 {
2050  int i;
2051  int n_threads = FLASH_Queue_get_num_threads();
2052  void* (*thread_entry_point)( void* );
2053 
2054  // Allocate the thread structures array. Here, an array of FLASH_Thread
2055  // structures of length n_threads is allocated and the fields of each
2056  // structure set to appropriate values.
2057  FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );
2058 
2059  // Initialize the thread structures array.
2060  for ( i = 0; i < n_threads; i++ )
2061  {
2062  // Save the thread's identifier.
2063  thread[i].id = i;
2064 
2065  // Save the pointer to the necessary variables with the thread.
2066  thread[i].args = arg;
2067 
2068  // The pthread object, if it was even compiled into the FLASH_Thread
2069  // structure, will be initialized by the pthread implementation when we
2070  // call pthread_create() and does not need to be touched at this time.
2071  }
2072 
2073  // Determine which function to send threads to.
2074  thread_entry_point = FLASH_Queue_exec_parallel_function;
2075 
2076 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
2077 
2078  // An OpenMP parallel for region spawns n_threads threads. Each thread
2079  // executes the work function with a different FLASH_Thread argument.
2080  // An implicit synchronization point exists at the end of the curly
2081  // brace scope.
2082  #pragma omp parallel for \
2083  private( i ) \
2084  shared( thread, n_threads, thread_entry_point ) \
2085  schedule( static, 1 ) \
2086  num_threads( n_threads )
2087  for ( i = 0; i < n_threads; ++i )
2088  {
2089  thread_entry_point( ( void* ) &thread[i] );
2090  }
2091 
2092 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2093 
2094  // Create each POSIX thread needed in addition to the main thread.
2095  for ( i = 1; i < n_threads; i++ )
2096  {
2097  int pthread_e_val;
2098 
2099  // Create thread i with default attributes.
2100  pthread_e_val = pthread_create( &(thread[i].pthread_obj),
2101  NULL,
2102  thread_entry_point,
2103  ( void* ) &thread[i] );
2104 
2105 #ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2106  FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
2107  FLA_Check_error_code( e_val );
2108 #endif
2109  }
2110 
2111  // The main thread is assigned the role of thread 0. Here we manually
2112  // execute it as a worker thread.
2113  thread_entry_point( ( void* ) &thread[0] );
2114 
2115  // Wait for non-main threads to finish.
2116  for ( i = 1; i < n_threads; i++ )
2117  {
2118  // These two variables are declared local to this for loop since this
2119  // is the only place they are needed, and since they would show up as
2120  // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
2121  // Strangely, the Intel compiler produces code that results in an
2122  // "unaligned access" runtime message if thread_status is declared as
2123  // an int. Declaring it as a long or void* appears to force the
2124  // compiler (not surprisingly) into aligning it to an 8-byte boundary.
2125  int pthread_e_val;
2126  void* thread_status;
2127 
2128  // Wait for thread i to invoke its respective pthread_exit().
2129  // The return value passed to pthread_exit() is provided to us
2130  // via status, if one was given.
2131  pthread_e_val = pthread_join( thread[i].pthread_obj,
2132  ( void** ) &thread_status );
2133 
2134 #ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2135  FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
2136  FLA_Check_error_code( e_val );
2137 #endif
2138  }
2139 
2140 #endif
2141 
2142  FLA_free( thread );
2143 
2144  return;
2145 }
int FLA_Error
Definition: FLA_type_defs.h:47
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLA_Error FLA_Check_pthread_join_result(int pthread_join_r_val)
Definition: FLA_Check.c:760
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
Definition: FLA_type_defs.h:254
void * FLASH_Queue_exec_parallel_function(void *arg)
Definition: FLASH_Queue_exec.c:2156
int i
Definition: bl1_axmyv2.c:145
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Error FLA_Check_pthread_create_result(int pthread_create_r_val)
Definition: FLA_Check.c:750

◆ FLASH_Queue_exec_parallel_function()

void * FLASH_Queue_exec_parallel_function ( void *  arg)

References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), i, FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

// Worker loop for the variant of FLASH_Queue_exec_parallel_function() that
// is built on the RCCE_* calls shown below.
//
// arg is cast to FLASH_Queue_vars*. If the value returned by RCCE_ue() is
// not below the configured thread count, the loop is skipped entirely.
// Otherwise each iteration dequeues at most one task under RCCE lock 0,
// executes it, updates its dependents, and then re-checks the termination
// condition (args->pc[0] >= n_tasks) under the same lock.
//
// Always returns NULL.
2988 {
2989  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2990  int i = RCCE_ue(); // presumably this core's id/rank -- confirm against the RCCE API
2991  int queue = 0; // this variant always dequeues from queue 0
2992  int cache = 0; // this variant always passes cache 0
2993  int n_tasks = FLASH_Queue_get_num_tasks();
2994  int n_threads = FLASH_Queue_get_num_threads();
2995  FLA_Bool condition;
2996  FLA_Bool available;
2997  FLASH_Task* t = NULL;
2998 
2999  // Do not let extraneous cores execute: only ids below n_threads enter the loop.
3000  if ( i < n_threads )
3001  condition = TRUE;
3002  else
3003  condition = FALSE;
3004 
3005  // Loop until all the tasks have committed.
3006  while ( condition )
3007  {
3008  RCCE_acquire_lock( 0 ); // the dequeue below runs while lock 0 is held
3009 
3010  // Obtain task to execute.
3011  t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
3012 
3013  RCCE_release_lock( 0 );
3014 
3015  // Dequeued a task from the waiting queue (t is NULL when none was returned).
3016  available = ( t != NULL );
3017 
3018  if ( available )
3019  {
3020  // Save the thread and cache that executes the task.
3021  t->thread = i;
3022  t->cache = cache;
3023 
3024  // Execute the task. Note that this happens outside lock 0.
3025  FLASH_Queue_exec_task( t );
3026 
3027  // Update task dependencies.
3028  FLASH_Task_update_dependencies( t, ( void* ) args );
3029  }
3030 
3031  RCCE_acquire_lock( 0 ); // the counter read below runs while lock 0 is held
3032 
3033  // Terminate loop once the counter in args->pc[0] has reached n_tasks.
3034  if ( args->pc[0] >= n_tasks )
3035  condition = FALSE;
3036 
3037  RCCE_release_lock( 0 );
3038  }
3039 
3040  return ( void* ) NULL;
3041 }
int pc
Definition: FLASH_Queue_exec.c:96
int RCCE_release_lock(int)
int RCCE_ue(void)
Definition: FLA_type_defs.h:183
FLASH_Task * FLASH_Task_update_dependencies(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:2316
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
Definition: FLASH_Queue_exec.c:54
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:678
int FLA_Bool
Definition: FLA_type_defs.h:46
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
int thread
Definition: FLA_type_defs.h:192
int RCCE_acquire_lock(int)
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223

◆ FLASH_Queue_exec_simulation()

void FLASH_Queue_exec_simulation ( void *  arg)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), i, FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), RCCE_wtime(), Synch_all(), FLASH_Dep_s::task, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec().

// Single-threaded, stage-by-stage simulation of the task queue.
//
// arg is cast to FLASH_Queue_vars*. Each pass of the outer loop is one
// "stage": (1) for every task executed in the previous stage, decrement
// n_ready on each dependent task and enqueue those that reach zero, then
// free the finished task; (2) dequeue at most one task per simulated thread
// and advance args->pc for each task obtained; (3) execute the dequeued
// tasks one after another. The loop ends when args->pc reaches the number
// of tasks.
//
// When verbose is zero a table is printed: one row per stage holding the
// stage number followed by the name of the task run by each thread (a blank
// for idle threads).
//
// NOTE(review): n_cores is used as a divisor/modulus and
// ( n_threads / n_queues ) as a divisor; both are assumed non-zero here --
// confirm against the callers' configuration.
2595 {
2596  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2597  int i, j;
2598  int queue;
2599  int cache;
2600  int n_stages = 0;
2601  int n_queues = args->n_queues;
2602  int n_tasks = FLASH_Queue_get_num_tasks();
2603  int n_threads = FLASH_Queue_get_num_threads();
2604  int n_cores = FLASH_Queue_get_cores_per_cache();
2605  FLASH_Verbose verbose = FLASH_Queue_get_verbose_output();
2606  FLASH_Task* task;
2607  FLASH_Task* t;
2608  FLASH_Dep* d;
2609 
2610  // An array to hold the tasks to be executed during each stage of the simulation.
2611 #ifdef FLA_ENABLE_WINDOWS_BUILD
2612  FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
2613 #else
2614  FLASH_Task* exec_array[n_threads];
2615 #endif
2616 
2617  for ( i = 0; i < n_threads; i++ )
2618  {
2619  // Initialize all exec_array to NULL.
2620  exec_array[i] = NULL;
2621 
2622  // Prefetch blocks into the cache before execution (once per group of n_cores threads).
2623  if ( i % n_cores == 0 )
2624  FLASH_Queue_prefetch( i, arg );
2625  }
2626 
2627  // Loop until all the tasks have committed.
2628  while ( args->pc < n_tasks )
2629  {
2630  for ( i = 0; i < n_threads; i++ )
2631  {
2632  // Update waiting queue with ready tasks.
2633  t = exec_array[i];
2634 
2635  if ( t != NULL )
2636  {
2637  // Check each dependent task.
2638  d = t->dep_arg_head;
2639 
2640  for ( j = 0; j < t->n_dep_args; j++ )
2641  {
2642  task = d->task;
2643  task->n_ready--;
2644 
2645  // Place newly ready tasks on waiting queue.
2646  if ( task->n_ready == 0 )
2647  {
2648  FLASH_Queue_wait_enqueue( task, arg );
2649  }
2650 
2651  // Go to the next dep.
2652  d = d->next_dep;
2653  }
2654 
2655  // Free the task now that its dependents have been updated.
2656  FLASH_Task_free( t );
2657  }
2658  }
2659 
2660  n_stages++;
2661  if ( !verbose )
2662  printf( "%7d", n_stages );
2663 
2664  // Move ready tasks from the waiting queue to execution queue.
2665  for ( i = 0; i < n_threads; i++ )
2666  {
2667  // Determine to which queue this thread belongs.
2668  queue = i / ( n_threads / n_queues );
2669 
2670  // Determine to which cache this thread belongs.
2671  cache = i / n_cores;
2672 
2673  // Dequeue a task.
2674  t = FLASH_Queue_wait_dequeue( queue, cache, arg );
2675 
2676  // Save the task for execution (NULL marks an idle thread for this stage).
2677  exec_array[i] = t;
2678 
2679  if ( t != NULL )
2680  {
2681  // Save the thread and cache that executes the task.
2682  t->thread = i;
2683  t->cache = cache;
2684 
2685  // Increment program counter.
2686  args->pc++;
2687  }
2688  }
2689 
2690  // Execute independent tasks.
2691  for ( i = 0; i < n_threads; i++ )
2692  {
2693  t = exec_array[i];
2694  FLASH_Queue_update_cache( t, arg );
2695  FLASH_Queue_exec_task( t ); // returns immediately when t is NULL
2696 
2697  if ( !verbose )
2698  printf( "%7s", ( t == NULL ? " " : t->name ) );
2699 
2700  // Free the task if this is the last stage: the outer loop will not
2700  // run again, so the cleanup at the top of the loop would never see it.
2701  if ( args->pc == n_tasks && t != NULL )
2702  FLASH_Task_free( t );
2703  }
2704 
2705  if ( !verbose )
2706  printf( "\n" );
2707  }
2708 
2709  if ( !verbose )
2710  printf( "\n" );
2711 
2712 #ifdef FLA_ENABLE_WINDOWS_BUILD
2713  FLA_free( exec_array );
2714 #endif
2715 
2716  return;
2717 }
int pc
Definition: FLASH_Queue_exec.c:96
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
Definition: FLA_type_defs.h:244
int n_ready
Definition: FLA_type_defs.h:186
void FLASH_Queue_prefetch(int cache, void *arg)
Definition: FLASH_Queue_exec.c:1024
void FLASH_Queue_update_cache(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:847
Definition: FLA_type_defs.h:183
int FLASH_Verbose
Definition: FLA_type_defs.h:113
FLASH_Task * task
Definition: FLA_type_defs.h:247
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
Definition: FLASH_Queue_exec.c:54
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:678
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
int FLASH_Queue_get_cores_per_cache(void)
Definition: FLASH_Queue.c:548
void FLASH_Task_free(FLASH_Task *t)
Definition: FLASH_Queue.c:1020
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
int thread
Definition: FLA_type_defs.h:192
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition: FLASH_Queue.c:308
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
char * name
Definition: FLA_type_defs.h:203
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Queue_exec_task()

void FLASH_Queue_exec_task ( FLASH_Task *  t)

References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

// Execute a single queued task.
//
// Does nothing when t is NULL. Otherwise t->func is compared against each
// known FLA_*_task function; on a match it is cast back to a function
// pointer of the matching signature and called with arguments taken from
// t->int_arg, t->fla_arg, t->input_arg, t->output_arg and t->cntl, in the
// order each branch spells out. The FLA_Error returned by the task function
// is not inspected. If t->func matches none of the known functions,
// FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ) is invoked.
//
// The nb_alg parameter of flash_sa_lu_p / flash_sa_fs_p is declared dim_t
// (and the argument cast accordingly) so that the pointer types used for the
// call agree with the FLA_SA_LU_task / FLA_SA_FS_task prototypes; calling
// through a pointer whose parameter types differ from the callee's is
// undefined behavior.
1147 {
1148  // Define local function pointer types, one per supported task signature.
1149 
1150  // LAPACK-level
1151  typedef FLA_Error(*flash_lu_piv_macro_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
1152  typedef FLA_Error(*flash_apply_pivots_macro_p)(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl);
1153  typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl);
1154  typedef FLA_Error(*flash_lu_piv_copy_p)(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl);
1155  typedef FLA_Error(*flash_trsm_piv_p)(FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl);
1156  typedef FLA_Error(*flash_sa_lu_p)(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl);
1157  typedef FLA_Error(*flash_sa_fs_p)(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t* cntl);
1158  typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl);
1159  typedef FLA_Error(*flash_trinv_p)(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl);
1160  typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl);
1161  typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl);
1162  typedef FLA_Error(*flash_sylv_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl);
1163  typedef FLA_Error(*flash_lyap_p)(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl);
1164  typedef FLA_Error(*flash_qrut_macro_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
1165  typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
1166  typedef FLA_Error(*flash_qrutc_p)(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl);
1167  typedef FLA_Error(*flash_qr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl);
1168  typedef FLA_Error(*flash_lqut_macro_p)(FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl);
1169  typedef FLA_Error(*flash_caqr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl);
1170  typedef FLA_Error(*flash_uddateut_p)(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl);
1171  typedef FLA_Error(*flash_apqut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl);
1172  typedef FLA_Error(*flash_apq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl);
1173  typedef FLA_Error(*flash_apcaq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl);
1174  typedef FLA_Error(*flash_apqudut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl);
1175  typedef FLA_Error(*flash_eig_gest_p)(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl);
1176 
1177  // Level-3 BLAS
1178  typedef FLA_Error(*flash_gemm_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl);
1179  typedef FLA_Error(*flash_hemm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl);
1180  typedef FLA_Error(*flash_herk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl);
1181  typedef FLA_Error(*flash_her2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl);
1182  typedef FLA_Error(*flash_symm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl);
1183  typedef FLA_Error(*flash_syrk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl);
1184  typedef FLA_Error(*flash_syr2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl);
1185  typedef FLA_Error(*flash_trmm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl);
1186  typedef FLA_Error(*flash_trsm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl);
1187 
1188  // Level-2 BLAS
1189  typedef FLA_Error(*flash_gemv_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl);
1190  typedef FLA_Error(*flash_trsv_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl);
1191 
1192  // Level-1 BLAS
1193  typedef FLA_Error(*flash_axpy_p)(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl);
1194  typedef FLA_Error(*flash_axpyt_p)(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl);
1195  typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl);
1196  typedef FLA_Error(*flash_copyt_p)(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl);
1197  typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl);
1198  typedef FLA_Error(*flash_scal_p)(FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl);
1199  typedef FLA_Error(*flash_scalr_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl);
1200 
1201  // Base
1202  typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl);
1203  typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl);
1204 
1205  // Only execute task if it is not NULL.
1206  if ( t == NULL )
1207  return;
1208 
1209  // Now "switch" between the various possible task functions by comparing
1209  // t->func against each known task function in turn.
1210 
1211  // FLA_LU_piv_macro
1212  if ( t->func == (void *) FLA_LU_piv_macro_task )
1213  {
1214  flash_lu_piv_macro_p func;
1215  func = (flash_lu_piv_macro_p) t->func;
1216 
1217  func( t->output_arg[0],
1218  t->output_arg[1],
1219  ( fla_lu_t* ) t->cntl );
1220  }
1221  // FLA_Apply_pivots_macro
1222  else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
1223  {
1224  flash_apply_pivots_macro_p func;
1225  func = (flash_apply_pivots_macro_p) t->func;
1226 
1227  func( ( FLA_Side ) t->int_arg[0],
1228  ( FLA_Trans ) t->int_arg[1],
1229  t->input_arg[0],
1230  t->output_arg[0],
1231  ( fla_appiv_t* ) t->cntl );
1232  }
1233  // FLA_LU_piv
1234  else if ( t->func == (void *) FLA_LU_piv_task )
1235  {
1236  flash_lu_piv_p func;
1237  func = (flash_lu_piv_p) t->func;
1238 
1239  func( t->output_arg[0],
1240  t->fla_arg[0],
1241  ( fla_lu_t* ) t->cntl );
1242  }
1243  // FLA_LU_piv_copy
1244  else if ( t->func == (void *) FLA_LU_piv_copy_task )
1245  {
1246  flash_lu_piv_copy_p func;
1247  func = (flash_lu_piv_copy_p) t->func;
1248 
1249  func( t->output_arg[0],
1250  t->fla_arg[0],
1251  t->output_arg[1],
1252  ( fla_lu_t* ) t->cntl );
1253  }
1254  // FLA_Trsm_piv
1255  else if ( t->func == (void *) FLA_Trsm_piv_task )
1256  {
1257  flash_trsm_piv_p func;
1258  func = (flash_trsm_piv_p) t->func;
1259 
1260  func( t->input_arg[0],
1261  t->output_arg[0],
1262  t->fla_arg[0],
1263  ( fla_trsm_t* ) t->cntl );
1264  }
1265  // FLA_SA_LU
1266  else if ( t->func == (void *) FLA_SA_LU_task )
1267  {
1268  flash_sa_lu_p func;
1269  func = (flash_sa_lu_p) t->func;
1270 
1271  func( t->output_arg[1],
1272  t->output_arg[0],
1273  t->fla_arg[0],
1274  t->fla_arg[1],
1275  ( dim_t ) t->int_arg[0],
1276  ( fla_lu_t* ) t->cntl );
1277  }
1278  // FLA_SA_FS
1279  else if ( t->func == (void *) FLA_SA_FS_task )
1280  {
1281  flash_sa_fs_p func;
1282  func = (flash_sa_fs_p) t->func;
1283 
1284  func( t->fla_arg[0],
1285  t->input_arg[0],
1286  t->fla_arg[1],
1287  t->output_arg[1],
1288  t->output_arg[0],
1289  ( dim_t ) t->int_arg[0],
1290  ( fla_gemm_t* ) t->cntl );
1291  }
1292  // FLA_LU_nopiv
1293  else if ( t->func == (void *) FLA_LU_nopiv_task )
1294  {
1295  flash_lu_nopiv_p func;
1296  func = (flash_lu_nopiv_p) t->func;
1297 
1298  func( t->output_arg[0],
1299  ( fla_lu_t* ) t->cntl );
1300  }
1301  // FLA_Trinv
1302  else if ( t->func == (void *) FLA_Trinv_task )
1303  {
1304  flash_trinv_p func;
1305  func = (flash_trinv_p) t->func;
1306 
1307  func( ( FLA_Uplo ) t->int_arg[0],
1308  ( FLA_Diag ) t->int_arg[1],
1309  t->output_arg[0],
1310  ( fla_trinv_t* ) t->cntl );
1311  }
1312  // FLA_Ttmm
1313  else if ( t->func == (void *) FLA_Ttmm_task )
1314  {
1315  flash_ttmm_p func;
1316  func = (flash_ttmm_p) t->func;
1317 
1318  func( ( FLA_Uplo ) t->int_arg[0],
1319  t->output_arg[0],
1320  ( fla_ttmm_t* ) t->cntl );
1321  }
1322  // FLA_Chol
1323  else if ( t->func == (void *) FLA_Chol_task )
1324  {
1325  flash_chol_p func;
1326  func = (flash_chol_p) t->func;
1327 
1328  func( ( FLA_Uplo ) t->int_arg[0],
1329  t->output_arg[0],
1330  ( fla_chol_t* ) t->cntl );
1331  }
1332  // FLA_Sylv
1333  else if ( t->func == (void *) FLA_Sylv_task )
1334  {
1335  flash_sylv_p func;
1336  func = (flash_sylv_p) t->func;
1337 
1338  func( ( FLA_Trans ) t->int_arg[0],
1339  ( FLA_Trans ) t->int_arg[1],
1340  t->fla_arg[0],
1341  t->input_arg[0],
1342  t->input_arg[1],
1343  t->output_arg[0],
1344  t->fla_arg[1],
1345  ( fla_sylv_t* ) t->cntl );
1346  }
1347  // FLA_Lyap
1348  else if ( t->func == (void *) FLA_Lyap_task )
1349  {
1350  flash_lyap_p func;
1351  func = (flash_lyap_p) t->func;
1352 
1353  func( ( FLA_Trans ) t->int_arg[0],
1354  t->fla_arg[0],
1355  t->input_arg[0],
1356  t->output_arg[0],
1357  t->fla_arg[1],
1358  ( fla_lyap_t* ) t->cntl );
1359  }
1360  // FLA_QR_UT_macro
1361  else if ( t->func == (void *) FLA_QR_UT_macro_task )
1362  {
1363  flash_qrut_macro_p func;
1364  func = (flash_qrut_macro_p) t->func;
1365 
1366  func( t->output_arg[0],
1367  t->output_arg[1],
1368  ( fla_qrut_t* ) t->cntl );
1369  }
1370  // FLA_QR_UT
1371  else if ( t->func == (void *) FLA_QR_UT_task )
1372  {
1373  flash_qrut_p func;
1374  func = (flash_qrut_p) t->func;
1375 
1376  func( t->output_arg[0],
1377  t->fla_arg[0],
1378  ( fla_qrut_t* ) t->cntl );
1379  }
1380  // FLA_QR_UT_copy
1381  else if ( t->func == (void *) FLA_QR_UT_copy_task )
1382  {
1383  flash_qrutc_p func;
1384  func = (flash_qrutc_p) t->func;
1385 
1386  func( t->output_arg[0],
1387  t->fla_arg[0],
1388  t->output_arg[1],
1389  ( fla_qrut_t* ) t->cntl );
1390  }
1391  // FLA_QR2_UT
1392  else if ( t->func == (void *) FLA_QR2_UT_task )
1393  {
1394  flash_qr2ut_p func;
1395  func = (flash_qr2ut_p) t->func;
1396 
1397  func( t->output_arg[1],
1398  t->output_arg[0],
1399  t->fla_arg[0],
1400  ( fla_qr2ut_t* ) t->cntl );
1401  }
1402  // FLA_LQ_UT_macro
1403  else if ( t->func == (void *) FLA_LQ_UT_macro_task )
1404  {
1405  flash_lqut_macro_p func;
1406  func = (flash_lqut_macro_p) t->func;
1407 
1408  func( t->output_arg[0],
1409  t->output_arg[1],
1410  ( fla_lqut_t* ) t->cntl );
1411  }
1412  // FLA_CAQR2_UT
1413  else if ( t->func == (void *) FLA_CAQR2_UT_task )
1414  {
1415  flash_caqr2ut_p func;
1416  func = (flash_caqr2ut_p) t->func;
1417 
1418  func( t->output_arg[1],
1419  t->output_arg[0],
1420  t->fla_arg[0],
1421  ( fla_caqr2ut_t* ) t->cntl );
1422  }
1423  // FLA_UDdate_UT
1424  else if ( t->func == (void *) FLA_UDdate_UT_task )
1425  {
1426  flash_uddateut_p func;
1427  func = (flash_uddateut_p) t->func;
1428 
1429  func( t->output_arg[0],
1430  t->output_arg[1],
1431  t->output_arg[2],
1432  t->output_arg[3],
1433  ( fla_uddateut_t* ) t->cntl );
1434  }
1435  // FLA_Apply_Q_UT
1436  else if ( t->func == (void *) FLA_Apply_Q_UT_task )
1437  {
1438  flash_apqut_p func;
1439  func = (flash_apqut_p) t->func;
1440 
1441  func( ( FLA_Side ) t->int_arg[0],
1442  ( FLA_Trans ) t->int_arg[1],
1443  ( FLA_Direct ) t->int_arg[2],
1444  ( FLA_Store ) t->int_arg[3],
1445  t->input_arg[0],
1446  t->fla_arg[0],
1447  t->output_arg[1],
1448  t->output_arg[0],
1449  ( fla_apqut_t* ) t->cntl );
1450  }
1451  // FLA_Apply_Q2_UT
1452  else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
1453  {
1454  flash_apq2ut_p func;
1455  func = (flash_apq2ut_p) t->func;
1456 
1457  func( ( FLA_Side ) t->int_arg[0],
1458  ( FLA_Trans ) t->int_arg[1],
1459  ( FLA_Direct ) t->int_arg[2],
1460  ( FLA_Store ) t->int_arg[3],
1461  t->input_arg[0],
1462  t->fla_arg[0],
1463  t->output_arg[2],
1464  t->output_arg[1],
1465  t->output_arg[0],
1466  ( fla_apq2ut_t* ) t->cntl );
1467  }
1468  // FLA_Apply_CAQ2_UT
1469  else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
1470  {
1471  flash_apcaq2ut_p func;
1472  func = (flash_apcaq2ut_p) t->func;
1473 
1474  func( ( FLA_Side ) t->int_arg[0],
1475  ( FLA_Trans ) t->int_arg[1],
1476  ( FLA_Direct ) t->int_arg[2],
1477  ( FLA_Store ) t->int_arg[3],
1478  t->input_arg[0],
1479  t->fla_arg[0],
1480  t->output_arg[2],
1481  t->output_arg[1],
1482  t->output_arg[0],
1483  ( fla_apcaq2ut_t* ) t->cntl );
1484  }
1485  // FLA_Apply_QUD_UT
1486  else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
1487  {
1488  flash_apqudut_p func;
1489  func = (flash_apqudut_p) t->func;
1490 
1491  func( ( FLA_Side ) t->int_arg[0],
1492  ( FLA_Trans ) t->int_arg[1],
1493  ( FLA_Direct ) t->int_arg[2],
1494  ( FLA_Store ) t->int_arg[3],
1495  t->input_arg[0],
1496  t->output_arg[0],
1497  t->output_arg[1],
1498  t->input_arg[1],
1499  t->output_arg[2],
1500  t->input_arg[2],
1501  t->output_arg[3],
1502  ( fla_apqudut_t* ) t->cntl );
1503  }
1504  // FLA_Eig_gest
1505  else if ( t->func == (void *) FLA_Eig_gest_task )
1506  {
1507  flash_eig_gest_p func;
1508  func = (flash_eig_gest_p) t->func;
1509 
1510  func( ( FLA_Inv ) t->int_arg[0],
1511  ( FLA_Uplo ) t->int_arg[1],
1512  t->output_arg[1],
1513  t->output_arg[0],
1514  t->input_arg[0],
1515  ( fla_eig_gest_t* ) t->cntl );
1516  }
1517  // FLA_Gemm
1518  else if ( t->func == (void *) FLA_Gemm_task )
1519  {
1520  flash_gemm_p func;
1521  func = (flash_gemm_p) t->func;
1522 
1523  func( ( FLA_Trans ) t->int_arg[0],
1524  ( FLA_Trans ) t->int_arg[1],
1525  t->fla_arg[0],
1526  t->input_arg[0],
1527  t->input_arg[1],
1528  t->fla_arg[1],
1529  t->output_arg[0],
1530  ( fla_gemm_t* ) t->cntl );
1531  }
1532  // FLA_Hemm
1533  else if ( t->func == (void *) FLA_Hemm_task )
1534  {
1535  flash_hemm_p func;
1536  func = (flash_hemm_p) t->func;
1537 
1538  func( ( FLA_Side ) t->int_arg[0],
1539  ( FLA_Uplo ) t->int_arg[1],
1540  t->fla_arg[0],
1541  t->input_arg[0],
1542  t->input_arg[1],
1543  t->fla_arg[1],
1544  t->output_arg[0],
1545  ( fla_hemm_t* ) t->cntl );
1546  }
1547  // FLA_Herk
1548  else if ( t->func == (void *) FLA_Herk_task )
1549  {
1550  flash_herk_p func;
1551  func = (flash_herk_p) t->func;
1552 
1553  func( ( FLA_Uplo ) t->int_arg[0],
1554  ( FLA_Trans ) t->int_arg[1],
1555  t->fla_arg[0],
1556  t->input_arg[0],
1557  t->fla_arg[1],
1558  t->output_arg[0],
1559  ( fla_herk_t* ) t->cntl );
1560  }
1561  // FLA_Her2k
1562  else if ( t->func == (void *) FLA_Her2k_task )
1563  {
1564  flash_her2k_p func;
1565  func = (flash_her2k_p) t->func;
1566 
1567  func( ( FLA_Uplo ) t->int_arg[0],
1568  ( FLA_Trans ) t->int_arg[1],
1569  t->fla_arg[0],
1570  t->input_arg[0],
1571  t->input_arg[1],
1572  t->fla_arg[1],
1573  t->output_arg[0],
1574  ( fla_her2k_t* ) t->cntl );
1575  }
1576  // FLA_Symm
1577  else if ( t->func == (void *) FLA_Symm_task )
1578  {
1579  flash_symm_p func;
1580  func = (flash_symm_p) t->func;
1581 
1582  func( ( FLA_Side ) t->int_arg[0],
1583  ( FLA_Uplo ) t->int_arg[1],
1584  t->fla_arg[0],
1585  t->input_arg[0],
1586  t->input_arg[1],
1587  t->fla_arg[1],
1588  t->output_arg[0],
1589  ( fla_symm_t* ) t->cntl );
1590  }
1591  // FLA_Syrk
1592  else if ( t->func == (void *) FLA_Syrk_task )
1593  {
1594  flash_syrk_p func;
1595  func = (flash_syrk_p) t->func;
1596 
1597  func( ( FLA_Uplo ) t->int_arg[0],
1598  ( FLA_Trans ) t->int_arg[1],
1599  t->fla_arg[0],
1600  t->input_arg[0],
1601  t->fla_arg[1],
1602  t->output_arg[0],
1603  ( fla_syrk_t* ) t->cntl );
1604  }
1605  // FLA_Syr2k
1606  else if ( t->func == (void *) FLA_Syr2k_task )
1607  {
1608  flash_syr2k_p func;
1609  func = (flash_syr2k_p) t->func;
1610 
1611  func( ( FLA_Uplo ) t->int_arg[0],
1612  ( FLA_Trans ) t->int_arg[1],
1613  t->fla_arg[0],
1614  t->input_arg[0],
1615  t->input_arg[1],
1616  t->fla_arg[1],
1617  t->output_arg[0],
1618  ( fla_syr2k_t* ) t->cntl );
1619  }
1620  // FLA_Trmm
1621  else if ( t->func == (void *) FLA_Trmm_task )
1622  {
1623  flash_trmm_p func;
1624  func = (flash_trmm_p) t->func;
1625 
1626  func( ( FLA_Side ) t->int_arg[0],
1627  ( FLA_Uplo ) t->int_arg[1],
1628  ( FLA_Trans ) t->int_arg[2],
1629  ( FLA_Diag ) t->int_arg[3],
1630  t->fla_arg[0],
1631  t->input_arg[0],
1632  t->output_arg[0],
1633  ( fla_trmm_t* ) t->cntl );
1634  }
1635  // FLA_Trsm
1636  else if ( t->func == (void *) FLA_Trsm_task )
1637  {
1638  flash_trsm_p func;
1639  func = (flash_trsm_p) t->func;
1640 
1641  func( ( FLA_Side ) t->int_arg[0],
1642  ( FLA_Uplo ) t->int_arg[1],
1643  ( FLA_Trans ) t->int_arg[2],
1644  ( FLA_Diag ) t->int_arg[3],
1645  t->fla_arg[0],
1646  t->input_arg[0],
1647  t->output_arg[0],
1648  ( fla_trsm_t* ) t->cntl );
1649  }
1650  // FLA_Gemv
1651  else if ( t->func == (void *) FLA_Gemv_task )
1652  {
1653  flash_gemv_p func;
1654  func = (flash_gemv_p) t->func;
1655 
1656  func( ( FLA_Trans ) t->int_arg[0],
1657  t->fla_arg[0],
1658  t->input_arg[0],
1659  t->input_arg[1],
1660  t->fla_arg[1],
1661  t->output_arg[0],
1662  ( fla_gemv_t* ) t->cntl );
1663  }
1664  // FLA_Trsv
1665  else if ( t->func == (void *) FLA_Trsv_task )
1666  {
1667  flash_trsv_p func;
1668  func = (flash_trsv_p) t->func;
1669 
1670  func( ( FLA_Uplo ) t->int_arg[0],
1671  ( FLA_Trans ) t->int_arg[1],
1672  ( FLA_Diag ) t->int_arg[2],
1673  t->input_arg[0],
1674  t->output_arg[0],
1675  ( fla_trsv_t* ) t->cntl );
1676  }
1677  // FLA_Axpy
1678  else if ( t->func == (void *) FLA_Axpy_task )
1679  {
1680  flash_axpy_p func;
1681  func = (flash_axpy_p) t->func;
1682 
1683  func( t->fla_arg[0],
1684  t->input_arg[0],
1685  t->output_arg[0],
1686  ( fla_axpy_t* ) t->cntl );
1687  }
1688  // FLA_Axpyt
1689  else if ( t->func == (void *) FLA_Axpyt_task )
1690  {
1691  flash_axpyt_p func;
1692  func = (flash_axpyt_p) t->func;
1693 
1694  func( ( FLA_Trans ) t->int_arg[0],
1695  t->fla_arg[0],
1696  t->input_arg[0],
1697  t->output_arg[0],
1698  ( fla_axpyt_t* ) t->cntl );
1699  }
1700  // FLA_Copy
1701  else if ( t->func == (void *) FLA_Copy_task )
1702  {
1703  flash_copy_p func;
1704  func = (flash_copy_p) t->func;
1705 
1706  func( t->input_arg[0],
1707  t->output_arg[0],
1708  ( fla_copy_t* ) t->cntl );
1709  }
1710  // FLA_Copyt
1711  else if ( t->func == (void *) FLA_Copyt_task )
1712  {
1713  flash_copyt_p func;
1714  func = (flash_copyt_p) t->func;
1715 
1716  func( ( FLA_Trans ) t->int_arg[0],
1717  t->input_arg[0],
1718  t->output_arg[0],
1719  ( fla_copyt_t* ) t->cntl );
1720  }
1721  // FLA_Copyr
1722  else if ( t->func == (void *) FLA_Copyr_task )
1723  {
1724  flash_copyr_p func;
1725  func = (flash_copyr_p) t->func;
1726 
1727  func( ( FLA_Uplo ) t->int_arg[0],
1728  t->input_arg[0],
1729  t->output_arg[0],
1730  ( fla_copyr_t* ) t->cntl );
1731  }
1732  // FLA_Scal
1733  else if ( t->func == (void *) FLA_Scal_task )
1734  {
1735  flash_scal_p func;
1736  func = (flash_scal_p) t->func;
1737 
1738  func( t->fla_arg[0],
1739  t->output_arg[0],
1740  ( fla_scal_t* ) t->cntl );
1741  }
1742  // FLA_Scalr
1743  else if ( t->func == (void *) FLA_Scalr_task )
1744  {
1745  flash_scalr_p func;
1746  func = (flash_scalr_p) t->func;
1747 
1748  func( ( FLA_Uplo ) t->int_arg[0],
1749  t->fla_arg[0],
1750  t->output_arg[0],
1751  ( fla_scalr_t* ) t->cntl );
1752  }
1753  // FLA_Obj_create_buffer
1754  else if ( t->func == (void *) FLA_Obj_create_buffer_task )
1755  {
1756  flash_obj_create_buffer_p func;
1757  func = (flash_obj_create_buffer_p) t->func;
1758 
1759  func( ( dim_t ) t->int_arg[0],
1760  ( dim_t ) t->int_arg[1],
1761  t->output_arg[0],
1762  t->cntl );
1763  }
1764  // FLA_Obj_free_buffer
1765  else if ( t->func == (void *) FLA_Obj_free_buffer_task )
1766  {
1767  flash_obj_free_buffer_p func;
1768  func = (flash_obj_free_buffer_p) t->func;
1769 
1770  func( t->output_arg[0],
1771  t->cntl );
1772  }
1773  else // t->func matched none of the task functions handled above
1774  {
1775  FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
1776  }
1777 
1778  return;
1779 }
FLA_Error FLA_Copyt_task(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t *cntl)
Definition: FLA_Copyt_task.c:13
FLA_Error FLA_Gemm_task(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t *cntl)
Definition: FLA_Gemm_task.c:13
int * int_arg
Definition: FLA_type_defs.h:210
Definition: FLA_Cntl_lapack.h:42
FLA_Error FLA_SA_LU_task(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
Definition: FLA_SA_LU_task.c:13
FLA_Error FLA_Gemv_task(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t *cntl)
Definition: FLA_Gemv_task.c:13
Definition: FLA_Cntl_blas1.h:77
Definition: FLA_Cntl_lapack.h:105
FLA_Error FLA_Apply_pivots_macro_task(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t *cntl)
Definition: FLA_Apply_pivots_macro_task.c:15
FLA_Error FLA_QR_UT_task(FLA_Obj A, FLA_Obj T, fla_qrut_t *cntl)
Definition: FLA_QR_UT_task.c:15
Definition: FLA_Cntl_blas3.h:65
Definition: FLA_Cntl_blas3.h:90
unsigned long dim_t
Definition: FLA_type_defs.h:71
Definition: FLA_Cntl_lapack.h:162
FLA_Error FLA_Lyap_task(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t *cntl)
Definition: FLA_Lyap_task.c:15
Definition: FLA_Cntl_lapack.h:80
Definition: FLA_Cntl_blas3.h:27
Definition: FLA_Cntl_blas3.h:115
Definition: FLA_Cntl_blas3.h:40
FLA_Error FLA_SA_FS_task(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t *cntl)
Definition: FLA_SA_FS_task.c:13
FLA_Error FLA_Eig_gest_task(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t *cntl)
Definition: FLA_Eig_gest_task.c:16
Definition: FLA_Cntl_blas3.h:103
FLA_Error FLA_LU_piv_copy_task(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t *cntl)
Definition: FLA_LU_piv_copy_task.c:13
FLA_Error FLA_LU_piv_task(FLA_Obj A, FLA_Obj p, fla_lu_t *cntl)
Definition: FLA_LU_piv_task.c:15
FLA_Error FLA_CAQR2_UT_task(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t *cntl)
Definition: FLA_CAQR2_UT_task.c:15
Definition: FLA_Cntl_blas1.h:26
FLA_Error FLA_Axpyt_task(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t *cntl)
Definition: FLA_Axpyt_task.c:13
FLA_Error FLA_Herk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t *cntl)
Definition: FLA_Herk_task.c:13
FLA_Error FLA_Hemm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t *cntl)
Definition: FLA_Hemm_task.c:13
Definition: FLA_Cntl_lapack.h:16
FLA_Error FLA_Symm_task(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t *cntl)
Definition: FLA_Symm_task.c:13
FLA_Error FLA_UDdate_UT_task(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t *cntl)
Definition: FLA_UDdate_UT_task.c:15
void * cntl
Definition: FLA_type_defs.h:200
Definition: FLA_Cntl_blas1.h:46
FLA_Error FLA_QR_UT_copy_task(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t *cntl)
Definition: FLA_QR_UT_copy_task.c:15
int FLA_Direct
Definition: FLA_type_defs.h:58
FLA_Error FLA_LU_nopiv_task(FLA_Obj A, fla_lu_t *cntl)
Definition: FLA_LU_nopiv_task.c:15
int FLA_Diag
Definition: FLA_type_defs.h:55
Definition: FLA_Cntl_blas1.h:56
Definition: FLA_Cntl_lapack.h:317
FLA_Error FLA_Syr2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t *cntl)
Definition: FLA_Syr2k_task.c:13
int FLA_Error
Definition: FLA_type_defs.h:47
FLA_Error FLA_Apply_Q_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t *cntl)
Definition: FLA_Apply_Q_UT_task.c:15
FLA_Error FLA_Trsm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t *cntl)
Definition: FLA_Trsm_task.c:13
FLA_Error FLA_Scal_task(FLA_Obj alpha, FLA_Obj A, fla_scal_t *cntl)
Definition: FLA_Scal_task.c:13
Definition: FLA_Cntl_lapack.h:210
Definition: FLA_Cntl_lapack.h:95
FLA_Error FLA_Trinv_task(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t *cntl)
Definition: FLA_Trinv_task.c:15
Definition: FLA_Cntl_blas1.h:67
Definition: FLA_type_defs.h:158
Definition: FLA_Cntl_blas2.h:26
int FLA_Store
Definition: FLA_type_defs.h:59
FLA_Error FLA_Trsm_piv_task(FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl)
Definition: FLA_Trsm_piv_task.c:13
FLA_Error FLA_LU_piv_macro_task(FLA_Obj A, FLA_Obj p, fla_lu_t *cntl)
Definition: FLA_LU_piv_macro_task.c:13
FLA_Error FLA_Sylv_task(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t *cntl)
Definition: FLA_Sylv_task.c:15
Definition: FLA_Cntl_lapack.h:227
FLA_Error FLA_Copyr_task(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t *cntl)
Definition: FLA_Copyr_task.c:13
Definition: FLA_Cntl_blas3.h:78
FLA_Error FLA_Apply_CAQ2_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t *cntl)
Definition: FLA_Apply_CAQ2_UT_task.c:15
FLA_Error FLA_Obj_free_buffer_task(FLA_Obj obj, void *cntl)
Definition: FLA_Obj_free_buffer_task.c:13
FLA_Error FLA_Apply_Q2_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t *cntl)
Definition: FLA_Apply_Q2_UT_task.c:15
FLA_Error FLA_Axpy_task(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t *cntl)
Definition: FLA_Axpy_task.c:13
FLA_Error FLA_QR_UT_macro_task(FLA_Obj A, FLA_Obj T, fla_qrut_t *cntl)
Definition: FLA_QR_UT_macro_task.c:15
FLA_Error FLA_QR2_UT_task(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t *cntl)
Definition: FLA_QR2_UT_task.c:15
FLA_Error FLA_Trsv_task(FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t *cntl)
Definition: FLA_Trsv_task.c:13
Definition: FLA_Cntl_lapack.h:148
Definition: FLA_Cntl_lapack.h:263
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
Definition: FLA_Cntl_lapack.h:182
int FLA_Trans
Definition: FLA_type_defs.h:53
FLA_Error FLA_Apply_QUD_UT_task(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t *cntl)
Definition: FLA_Apply_QUD_UT_task.c:15
FLA_Error FLA_Obj_create_buffer_task(dim_t rs, dim_t cs, FLA_Obj obj, void *cntl)
Definition: FLA_Obj_create_buffer_task.c:13
int FLA_Uplo
Definition: FLA_type_defs.h:52
int FLA_Side
Definition: FLA_type_defs.h:51
int FLA_Inv
Definition: FLA_type_defs.h:63
Definition: FLA_Cntl_lapack.h:29
Definition: FLA_Cntl_blas3.h:52
Definition: FLA_Cntl_blas1.h:16
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
Definition: FLA_Cntl_lapack.h:69
Definition: FLA_Cntl_lapack.h:306
FLA_Error FLA_Copy_task(FLA_Obj A, FLA_Obj B, fla_copy_t *cntl)
Definition: FLA_Copy_task.c:13
Definition: FLA_Cntl_blas1.h:36
Definition: FLA_Cntl_blas3.h:16
FLA_Error FLA_Syrk_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t *cntl)
Definition: FLA_Syrk_task.c:13
Definition: FLA_Cntl_blas2.h:16
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
FLA_Error FLA_Scalr_task(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t *cntl)
Definition: FLA_Scalr_task.c:13
FLA_Error FLA_LQ_UT_macro_task(FLA_Obj A, FLA_Obj T, fla_lqut_t *cntl)
Definition: FLA_LQ_UT_macro_task.c:15
FLA_Error FLA_Trmm_task(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t *cntl)
Definition: FLA_Trmm_task.c:13
void * func
Definition: FLA_type_defs.h:197
FLA_Error FLA_Ttmm_task(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t *cntl)
Definition: FLA_Ttmm_task.c:15
FLA_Error FLA_Chol_task(FLA_Uplo uplo, FLA_Obj A, fla_chol_t *cntl)
Definition: FLA_Chol_task.c:15
Definition: FLA_Cntl_lapack.h:52
Definition: FLA_Cntl_lapack.h:355
FLA_Error FLA_Her2k_task(FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t *cntl)
Definition: FLA_Her2k_task.c:13

◆ FLASH_Queue_finalize()

void FLASH_Queue_finalize ( void  )
268 {
269  // Exit early if we're not already initialized.
270  if ( flash_queue_initialized == FALSE )
271  return;
272 
273  // Clear the initialized flag.
274  flash_queue_initialized = FALSE;
275 
276 #ifdef FLA_ENABLE_GPU
277  FLASH_Queue_finalize_gpu();
278 #endif
279 
280  return;
281 }
void FLASH_Queue_finalize_gpu(void)
Definition: FLASH_Queue_gpu.c:36

◆ FLASH_Queue_flush_block_gpu()

void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

1899 {
1900  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1901  int k;
1902  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1903  FLA_Bool transfer = FALSE;
1904  FLA_Obj_gpu gpu_obj;
1905 
1906 #ifdef FLA_ENABLE_MULTITHREADING
1907  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1908 #endif
1909 
1910  // Locate the position of the block on the GPU.
1911  for ( k = 0; k < gpu_n_blocks; k++ )
1912  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1913  break;
1914 
1915  // The block is owned by the GPU.
1916  if ( k < gpu_n_blocks )
1917  {
1918  // Save the block that will be flushed.
1919  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1920 
1921  // If the block is dirty, then flush it.
1922  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1923  transfer = TRUE;
1924  }
1925 
1926 #ifdef FLA_ENABLE_MULTITHREADING
1927  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1928 #endif
1929 
1930  // Exit early if a flush is not required.
1931  if ( !transfer )
1932  return;
1933 
1934  // Flush the block outside the critical section.
1935  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1936 
1937 #ifdef FLA_ENABLE_MULTITHREADING
1938  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1939 #endif
1940 
1941  // Locate the position of the block on the GPU.
1942  for ( k = 0; k < gpu_n_blocks; k++ )
1943  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1944  break;
1945 
1946  if ( k < gpu_n_blocks )
1947  {
1948  // Update the bits for the flushed block.
1949  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1950  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1951  }
1952 
1953 #ifdef FLA_ENABLE_MULTITHREADING
1954  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1955 #endif
1956 
1957  return;
1958 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_flush_gpu()

void FLASH_Queue_flush_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

1967 {
1968  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1969  int i, k;
1970  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1971  int n_transfer = 0;
1972  FLA_Obj_gpu gpu_obj;
1973 
1974  // Exit if not using GPU.
1975  if ( !FLASH_Queue_get_enabled_gpu() )
1976  return;
1977 
1978 #ifdef FLA_ENABLE_MULTITHREADING
1979  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1980 #endif
1981 
1982  for ( k = 0; k < gpu_n_blocks; k++ )
1983  {
1984  // Save the block that might be flushed.
1985  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1986 
1987  // Flush the block if it is dirty and requested.
1988  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
1989  {
1990  // Save the block for data transfer outside the critical section.
1991  args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
1992  n_transfer++;
1993  }
1994  }
1995 
1996 #ifdef FLA_ENABLE_MULTITHREADING
1997  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1998 #endif
1999 
2000  // Exit early if a flush is not required.
2001  if ( n_transfer == 0 )
2002  return;
2003 
2004  // Flush the block outside the critical section.
2005  for ( i = 0; i < n_transfer; i++ )
2006  {
2007  gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
2008  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
2009  }
2010 
2011 #ifdef FLA_ENABLE_MULTITHREADING
2012  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
2013 #endif
2014 
2015  // Update the bits for each block that is flushed.
2016  for ( i = 0; i < n_transfer; i++ )
2017  {
2018  // Locate the position of the block on the GPU.
2019  for ( k = 0; k < gpu_n_blocks; k++ )
2020  if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
2021  args->gpu[thread * gpu_n_blocks + k].obj.base )
2022  break;
2023 
2024  if ( k < gpu_n_blocks )
2025  {
2026  // The block is now clean.
2027  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
2028  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
2029  }
2030  }
2031 
2032 #ifdef FLA_ENABLE_MULTITHREADING
2033  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
2034 #endif
2035 
2036  return;
2037 }
FLA_Obj_gpu * gpu_log
Definition: FLASH_Queue_exec.c:110
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
int i
Definition: bl1_axmyv2.c:145
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_get_block_size()

dim_t FLASH_Queue_get_block_size ( void  )

Referenced by FLASH_Queue_exec().

482 {
483  return flash_queue_block_size;
484 }

◆ FLASH_Queue_get_cache_line_size()

dim_t FLASH_Queue_get_cache_line_size ( void  )

Referenced by FLASH_Queue_prefetch_block().

530 {
531  return flash_queue_cache_line_size;
532 }

◆ FLASH_Queue_get_cache_size()

dim_t FLASH_Queue_get_cache_size ( void  )

Referenced by FLASH_Queue_exec().

506 {
507  return flash_queue_cache_size;
508 }

◆ FLASH_Queue_get_caching()

FLA_Bool FLASH_Queue_get_caching ( void  )

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_wait_dequeue(), and FLASH_Task_update_dependencies().

362 {
363  return flash_queue_caching;
364 }

◆ FLASH_Queue_get_cores_per_cache()

int FLASH_Queue_get_cores_per_cache ( void  )

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

554 {
555  return flash_queue_cores_per_cache;
556 }

◆ FLASH_Queue_get_cores_per_queue()

int FLASH_Queue_get_cores_per_queue ( void  )

Referenced by FLASH_Queue_exec().

578 {
579  return flash_queue_cores_per_queue;
580 }

◆ FLASH_Queue_get_data_affinity()

FLASH_Data_aff FLASH_Queue_get_data_affinity ( void  )

Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

410 {
411  return flash_queue_data_affinity;
412 }

◆ FLASH_Queue_get_enabled()

FLA_Bool FLASH_Queue_get_enabled ( void  )

◆ FLASH_Queue_get_head_task()

FLASH_Task* FLASH_Queue_get_head_task ( void  )

References FLASH_Queue_s::head.

Referenced by FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

609 {
610  return _tq.head;
611 }
FLASH_Queue _tq
Definition: FLASH_Queue.c:27
FLASH_Task * head
Definition: FLA_type_defs.h:179

◆ FLASH_Queue_get_num_tasks()

unsigned int FLASH_Queue_get_num_tasks ( void  )

References FLASH_Queue_s::n_tasks.

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

290 {
291  return _tq.n_tasks;
292 }
FLASH_Queue _tq
Definition: FLASH_Queue.c:27
unsigned int n_tasks
Definition: FLA_type_defs.h:176

◆ FLASH_Queue_get_num_threads()

unsigned int FLASH_Queue_get_num_threads ( void  )

◆ FLASH_Queue_get_parallel_time()

double FLASH_Queue_get_parallel_time ( void  )
436 {
437  // Only return time if out of parallel region.
438  if ( flash_queue_stack == 0 )
439  return flash_queue_parallel_time;
440 
441  return 0.0;
442 }

◆ FLASH_Queue_get_sorting()

FLA_Bool FLASH_Queue_get_sorting ( void  )

Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().

338 {
339  return flash_queue_sorting;
340 }

◆ FLASH_Queue_get_tail_task()

FLASH_Task* FLASH_Queue_get_tail_task ( void  )

References FLASH_Queue_s::tail.

Referenced by FLASH_Queue_init_tasks().

620 {
621  return _tq.tail;
622 }
FLASH_Queue _tq
Definition: FLASH_Queue.c:27
FLASH_Task * tail
Definition: FLA_type_defs.h:180

◆ FLASH_Queue_get_total_time()

double FLASH_Queue_get_total_time ( void  )
421 {
422  // Only return time if out of parallel region.
423  if ( flash_queue_stack == 0 )
424  return flash_queue_total_time;
425 
426  return 0.0;
427 }

◆ FLASH_Queue_get_verbose_output()

FLASH_Verbose FLASH_Queue_get_verbose_output ( void  )

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().

314 {
315  return flash_queue_verbose;
316 }

◆ FLASH_Queue_get_work_stealing()

FLA_Bool FLASH_Queue_get_work_stealing ( void  )

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().

386 {
387  return flash_queue_work_stealing;
388 }

◆ FLASH_Queue_init()

void FLASH_Queue_init ( void  )
243 {
244  // Exit early if we're already initialized.
245  if ( flash_queue_initialized == TRUE )
246  return;
247 
248  // Reset all the initial values.
249  FLASH_Queue_reset();
250 
251  // Set the initialized flag.
252  flash_queue_initialized = TRUE;
253 
254 #ifdef FLA_ENABLE_GPU
255  FLASH_Queue_init_gpu();
256 #endif
257 
258  return;
259 }
void FLASH_Queue_reset(void)
Definition: FLASH_Queue.c:583
void FLASH_Queue_init_gpu(void)
Definition: FLASH_Queue_gpu.c:23

◆ FLASH_Queue_init_tasks()

void FLASH_Queue_init_tasks ( void *  arg)

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, i, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

2836 {
2837  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2838  int i, j;
2839  int n_tasks = FLASH_Queue_get_num_tasks();
2840  int n_ready = 0;
2841  int height;
2842  FLASH_Task* t;
2843  FLASH_Dep* d;
2844 
2845  // Grab the tail of the task queue.
2846  t = FLASH_Queue_get_tail_task();
2847 
2848  for ( i = n_tasks - 1; i >= 0; i-- )
2849  {
2850  // Save all the task pointers.
2851  args->task_queue[i] = t;
2852 
2853  // Only use a single queue implementation.
2854  t->queue = 0;
2855 
2856  // Determine the height of each task in the DAG.
2857  height = 0;
2858  d = t->dep_arg_head;
2859 
2860  // Take the maximum height of dependent tasks.
2861  for ( j = 0; j < t->n_dep_args; j++ )
2862  {
2863  height = max( height, d->task->height );
2864  d = d->next_dep;
2865  }
2866 
2867  t->height = height + 1;
2868 
2869  // Since freeing a task is always a leaf, we want to force it to execute
2870  // earlier by giving it a greater height in order to reclaim memory.
2871  if ( t->func == (void *) FLA_Obj_free_buffer_task )
2872  t->height += n_tasks;
2873 
2874  // Find all ready tasks.
2875  t->n_ready += t->n_input_args + t->n_output_args +
2876  t->n_macro_args + t->n_war_args;
2877 
2878  if ( t->n_ready == 0 )
2879  {
2880  // Save the number of ready and available tasks.
2881  n_ready++;
2882  }
2883 
2884  if ( FLA_is_owner() )
2885  {
2886  // Record all the ready values.
2887  args->n_ready[i] = t->n_ready;
2888  }
2889 
2890  // Go to the previous task.
2891  t = t->prev_task;
2892  }
2893 
2894  // Only allow the first core to enqueue the initial ready tasks.
2895  if ( !FLA_is_owner() )
2896  return;
2897 
2898  // Grab the head of the task queue.
2899  t = FLASH_Queue_get_head_task();
2900 
2901  for ( i = 0; i < n_tasks && n_ready > 0; i++ )
2902  {
2903  if ( t->n_ready == 0 )
2904  {
2905  RCCE_acquire_lock( 0 );
2906 
2907  // Enqueue all the ready and available tasks.
2908  FLASH_Queue_wait_enqueue( t, arg );
2909 
2910  RCCE_release_lock( 0 );
2911 
2912  // Decrement the number of ready tasks left to be enqueued.
2913  n_ready--;
2914  }
2915 
2916  // Go to the next task.
2917  t = t->next_task;
2918  }
2919 
2920  return;
2921 }
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition: FLASH_Queue.c:603
int height
Definition: FLA_type_defs.h:191
int RCCE_release_lock(int)
int * n_ready
Definition: FLASH_Queue_exec.c:2742
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLASH_Task * next_task
Definition: FLA_type_defs.h:237
FLASH_Task ** task_queue
Definition: FLASH_Queue_exec.c:2739
Definition: FLA_type_defs.h:244
int n_ready
Definition: FLA_type_defs.h:186
Definition: FLA_type_defs.h:183
FLASH_Task * task
Definition: FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
FLA_Error FLA_Obj_free_buffer_task(FLA_Obj obj, void *cntl)
Definition: FLA_Obj_free_buffer_task.c:13
int n_input_args
Definition: FLA_type_defs.h:217
FLA_Bool FLA_is_owner(void)
Definition: FLA_Obj.c:33
int n_output_args
Definition: FLA_type_defs.h:221
int n_war_args
Definition: FLA_type_defs.h:228
int queue
Definition: FLA_type_defs.h:190
FLASH_Task * FLASH_Queue_get_tail_task(void)
Definition: FLASH_Queue.c:614
int n_macro_args
Definition: FLA_type_defs.h:225
Definition: FLASH_Queue_exec.c:54
FLASH_Task * prev_task
Definition: FLA_type_defs.h:236
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
int RCCE_acquire_lock(int)
void * func
Definition: FLA_type_defs.h:197

◆ FLASH_Queue_invalidate_block_gpu()

void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

1850 {
1851  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1852  int j, k;
1853  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1854  FLA_Obj_gpu gpu_obj;
1855 
1856 #ifdef FLA_ENABLE_MULTITHREADING
1857  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1858 #endif
1859 
1860  // Locate the position of the block on the GPU.
1861  for ( k = 0; k < gpu_n_blocks; k++ )
1862  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1863  break;
1864 
1865  // The block is owned by other GPU.
1866  if ( k < gpu_n_blocks )
1867  {
1868  // Invalidate the block.
1869  args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;
1870 
1871  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1872  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1873 
1874  // Save the block that will be invalidated.
1875  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1876 
1877  // Shift all the blocks for the invalidated block.
1878  for ( j = k; j < gpu_n_blocks - 1; j++ )
1879  args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j + 1];
1880 
1881  // Move to the LRU block.
1882  args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
1883  }
1884 
1885 #ifdef FLA_ENABLE_MULTITHREADING
1886  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1887 #endif
1888 
1889  return;
1890 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_mark_gpu()

void FLASH_Queue_mark_gpu ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

1793 {
1794  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1795  int i, j, k;
1796  int thread = t->thread;
1797  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1798  FLA_Bool duplicate;
1799  FLA_Obj obj;
1800 
1801  // Mark all the output blocks on the GPU as dirty.
1802  for ( i = t->n_output_args - 1; i >= 0; i-- )
1803  {
1804  obj = t->output_arg[i];
1805 
1806  // Check for duplicate blocks.
1807  duplicate = FALSE;
1808 
1809  for ( j = 0; j < i && !duplicate; j++ )
1810  {
1811  if ( obj.base == t->output_arg[j].base )
1812  duplicate = TRUE;
1813  }
1814 
1815  // If the output block has not been processed before.
1816  if ( !duplicate )
1817  {
1818 #ifdef FLA_ENABLE_MULTITHREADING
1819  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1820 #endif
1821 
1822  // Locate the position of the block on the GPU.
1823  for ( k = 0; k < gpu_n_blocks; k++ )
1824  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1825  break;
1826 
1827  if ( k < gpu_n_blocks )
1828  {
1829  // Change the bits for the new dirty block.
1830  args->gpu[thread * gpu_n_blocks + k].clean = FALSE;
1831  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1832  }
1833 
1834 #ifdef FLA_ENABLE_MULTITHREADING
1835  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1836 #endif
1837  }
1838  }
1839 
1840  return;
1841 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
Definition: FLASH_Queue_exec.c:54
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_prefetch()

void FLASH_Queue_prefetch ( int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), i, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

1030 {
1031  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1032  int i;
1033  int size = args->size;
1034  FLA_Obj obj;
1035 
1036  // Prefetch blocks in opposite order to maintain LRU.
1037  for ( i = size - 1; i >= 0; i-- )
1038  {
1039  obj = args->prefetch[i];
1040 
1041  // Only prefetch if it is a valid block.
1042  if ( obj.base != NULL )
1043  {
1044  // Prefetch the block.
1045  FLASH_Queue_prefetch_block( obj );
1046 
1047  // Record the prefetched block in the cache.
1048  args->cache[cache * size + i] = obj;
1049  }
1050  }
1051 
1052  return;
1053 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
void FLASH_Queue_prefetch_block(FLA_Obj obj)
Definition: FLASH_Queue_exec.c:1056
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
FLA_Obj * prefetch
Definition: FLASH_Queue_exec.c:89
Definition: FLASH_Queue_exec.c:54
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_prefetch_block()

void FLASH_Queue_prefetch_block ( FLA_Obj  obj)

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), i, scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

1062 {
1063  int i, inc;
1064  int line_size = FLASH_Queue_get_cache_line_size();
1065  int elem_size = FLA_Obj_elem_size( obj );
1066  int length = FLA_Obj_length( obj );
1067  int width = FLA_Obj_width( obj );
1068  FLA_Datatype datatype = FLA_Obj_datatype( obj );
1069 
1070  // Determine stride to prefetch block into cache.
1071  inc = line_size / elem_size;
1072 
1073  // Switch between the four different datatypes.
1074  switch ( datatype )
1075  {
1076  case FLA_FLOAT:
1077  {
1078  float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
1079  float access;
1080 
1081  // Access each cache line of the block.
1082  for ( i = 0; i < length * width; i += inc )
1083  access = buffer[i];
1084 
1085  // Prevent dead code elimination.
1086  access += 1.0;
1087 
1088  break;
1089  }
1090  case FLA_DOUBLE:
1091  {
1092  double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
1093  double access;
1094 
1095  // Access each cache line of the block.
1096  for ( i = 0; i < length * width; i += inc )
1097  access = buffer[i];
1098 
1099  // Prevent dead code elimination.
1100  access += 1.0;
1101 
1102  break;
1103  }
1104  case FLA_COMPLEX:
1105  {
1106  scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
1107  scomplex access;
1108 
1109  // Access each cache line of the block.
1110  for ( i = 0; i < length * width; i += inc )
1111  access = buffer[i];
1112 
1113  // Prevent dead code elimination.
1114  access.real += 1.0;
1115 
1116  break;
1117  }
1118  case FLA_DOUBLE_COMPLEX:
1119  {
1120  dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
1121  dcomplex access;
1122 
1123  // Access each cache line of the block.
1124  for ( i = 0; i < length * width; i += inc )
1125  access = buffer[i];
1126 
1127  // Prevent dead code elimination.
1128  access.real += 1.0;
1129 
1130  break;
1131  }
1132  case FLA_INT:
1133  {
1134  int *buffer = ( int * ) FLA_INT_PTR( obj );
1135  int access;
1136 
1137  // Access each cache line of the block.
1138  for ( i = 0; i < length * width; i += inc )
1139  access = buffer[i];
1140 
1141  // Prevent dead code elimination.
1142  access += 1.0;
1143 
1144  break;
1145  }
1146  default:
1147  // This default case should never execute.
1148  FLA_Check_error_code( FLA_INVALID_DATATYPE );
1149  }
1150 
1151  return;
1152 }
float real
Definition: blis_type_defs.h:134
double real
Definition: blis_type_defs.h:139
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition: FLA_Query.c:13
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
dim_t FLA_Obj_elem_size(FLA_Obj obj)
Definition: FLA_Query.c:95
dim_t FLASH_Queue_get_cache_line_size(void)
Definition: FLASH_Queue.c:524
Definition: blis_type_defs.h:132
int FLA_Datatype
Definition: FLA_type_defs.h:49
int i
Definition: bl1_axmyv2.c:145
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
Definition: blis_type_defs.h:137

◆ FLASH_Queue_push()

void FLASH_Queue_push ( void *  func,
void *  cntl,
char *  name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args,
  ... 
)

References FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Task_alloc(), i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLA_Obj_struct::write_task.

// FLASH_Queue_push(): create a task for `func` and append it to the global
// task queue `_tq`.
//
// The variadic arguments are read in this fixed order:
//   1. n_int_args    values of type int     -> t->int_arg[]
//   2. n_fla_args    values of type FLA_Obj -> t->fla_arg[]
//   3. n_input_args  values of type FLA_Obj -> t->input_arg[]
//   4. n_output_args values of type FLA_Obj -> t->output_arg[]
// func, cntl, name and enabled_gpu are only forwarded to FLASH_Task_alloc().
// Every input/output operand is handed to FLASH_Queue_push_input() /
// FLASH_Queue_push_output(); an operand whose elemtype is FLA_MATRIX is first
// expanded into the m x n blocks it references.
639 {
640  int i;
641  va_list var_arg_list;
642  FLASH_Task* t;
643  FLA_Obj obj;
644 
645  // Allocate a new FLASH_Task and populate its fields with appropriate values.
646  t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
647  n_int_args, n_fla_args,
648  n_input_args, n_output_args );
649 
650  // Initialize variable argument environment. In case you're wondering, the
651  // second argument in this macro invocation of va_start() is supposed to be
652  // the parameter that immediately precedes the variable argument list
653  // (ie: the ... above ).
654  va_start( var_arg_list, n_output_args );
655 
656  // Extract the integer arguments.
657  for ( i = 0; i < n_int_args; i++ )
658  t->int_arg[i] = va_arg( var_arg_list, int );
659 
660  // Extract the FLA_Obj arguments.
661  for ( i = 0; i < n_fla_args; i++ )
662  t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );
663 
664  // Extract the input FLA_Obj arguments.
665  for ( i = 0; i < n_input_args; i++ )
666  {
667  obj = va_arg( var_arg_list, FLA_Obj );
668  t->input_arg[i] = obj;
669 
670  // Macroblock is used.
671  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
672  {
673  dim_t jj, kk;
674  dim_t m = FLA_Obj_length( obj );
675  dim_t n = FLA_Obj_width( obj );
676  dim_t cs = FLA_Obj_col_stride( obj );
677  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
678 
679  // Dependence analysis for each input block in macroblock; the block in
// row kk, column jj is read from buf[jj * cs + kk].
680  for ( jj = 0; jj < n; jj++ )
681  for ( kk = 0; kk < m; kk++ )
682  FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );
683 
684  // Set the number of blocks in the macroblock subtracted by one
685  // since we do not want to recount an operand for each n_input_arg.
686  t->n_macro_args += m * n - 1;
687  }
688  else // Regular block.
689  {
690  // Dependence analysis for input operand.
691  FLASH_Queue_push_input( obj, t );
692  }
693  }
694 
695  // Extract the output FLA_Obj arguments.
696  for ( i = 0; i < n_output_args; i++ )
697  {
698  obj = va_arg( var_arg_list, FLA_Obj );
699  t->output_arg[i] = obj;
700 
701  // Only assign data affinity to the first output block.
702  if ( i == 0 )
703  {
704  FLA_Obj buf = obj;
705 
706  // Use the top left block of the macroblock.
707  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
708  buf = *FLASH_OBJ_PTR_AT( obj );
709 
// A block with no previous writer gets the current write-block counter as
// its queue; otherwise the task inherits the queue of the last writer.
710  if ( buf.base->write_task == NULL )
711  t->queue = flash_queue_n_write_blocks;
712  else
713  t->queue = buf.base->write_task->queue;
714  }
715 
716  // Macroblock is used.
717  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
718  {
719  dim_t jj, kk;
720  dim_t m = FLA_Obj_length( obj );
721  dim_t n = FLA_Obj_width( obj );
722  dim_t cs = FLA_Obj_col_stride( obj );
723  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
724 
725  // Dependence analysis for each output block in macroblock; the block in
// row kk, column jj is read from buf[jj * cs + kk].
726  for ( jj = 0; jj < n; jj++ )
727  for ( kk = 0; kk < m; kk++ )
728  FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );
729 
730  // Set the number of blocks in the macroblock subtracted by one
731  // since we do not want to recount an operand for each n_output_arg.
732  t->n_macro_args += m * n - 1;
733  }
734  else // Regular block.
735  {
736  // Dependence analysis for output operand.
737  FLASH_Queue_push_output( obj, t );
738  }
739  }
740 
741  // Finalize the variable argument environment.
742  va_end( var_arg_list );
743 
744  // Add the task to the tail of the queue (and the head if queue is empty).
// NOTE(review): when the queue is empty, t->order and t->prev_task are not
// assigned here; presumably FLASH_Task_alloc() initializes them - confirm.
745  if ( _tq.n_tasks == 0 )
746  {
747  _tq.head = t;
748  _tq.tail = t;
749  }
750  else
751  {
752  t->prev_task = _tq.tail;
753  _tq.tail->next_task = t;
754  _tq.tail = t;
755 
756  // Determine the index of the task in the task queue.
757  t->order = t->prev_task->order + 1;
758  }
759 
760  // Increment the number of tasks.
761  _tq.n_tasks++;
762 
763  return;
764 }
int * int_arg
Definition: FLA_type_defs.h:210
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLASH_Queue _tq
Definition: FLASH_Queue.c:27
FLASH_Task * next_task
Definition: FLA_type_defs.h:237
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
void FLASH_Queue_push_output(FLA_Obj obj, FLASH_Task *t)
Definition: FLASH_Queue.c:842
Definition: FLA_type_defs.h:183
FLASH_Task * tail
Definition: FLA_type_defs.h:180
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
FLASH_Task * head
Definition: FLA_type_defs.h:179
int queue
Definition: FLA_type_defs.h:190
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
int n_macro_args
Definition: FLA_type_defs.h:225
int order
Definition: FLA_type_defs.h:189
FLASH_Task * prev_task
Definition: FLA_type_defs.h:236
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
void FLASH_Queue_push_input(FLA_Obj obj, FLASH_Task *t)
Definition: FLASH_Queue.c:767
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
unsigned int n_tasks
Definition: FLA_type_defs.h:176
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
FLASH_Task * FLASH_Task_alloc(void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args)
Definition: FLASH_Queue.c:956
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51

◆ FLASH_Queue_push_input()

void FLASH_Queue_push_input ( FLA_Obj  obj,
FLASH_Task *  t 
)

References FLA_Obj_view::base, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_push().

// FLASH_Queue_push_input(): register task t as a reader of block obj.
//
// If obj has a recorded last writer (obj.base->write_task), a FLASH_Dep node
// pointing at t is appended to that writer's dep_arg list. Otherwise
// t->n_ready is decremented and, if the block has no readers yet, it is given
// the next read-block id. Unless t is already the tail of obj's read-task
// list, a second FLASH_Dep node for t is appended to that list.
774 {
775  FLASH_Task* task;
776  FLASH_Dep* d;
777 
778  // Find dependence information.
779  if ( obj.base->write_task == NULL )
780  {
// No pending writer for this block.
// NOTE(review): n_ready presumably counts operands still awaiting a producer - confirm.
781  t->n_ready--;
782 
783  // Add to number of blocks read if not written and not read before.
784  if ( obj.base->n_read_tasks == 0 )
785  {
786  // Identify each read block with an id for freeing.
787  obj.base->n_read_blocks = flash_queue_n_read_blocks;
788 
789  flash_queue_n_read_blocks++;
790  }
791  }
792  else
793  { // Flow dependence.
794  task = obj.base->write_task;
795 
796  d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
797 
798  d->task = t;
799  d->next_dep = NULL;
800 
// Append d to the singly linked dep_arg list of the writing task.
801  if ( task->n_dep_args == 0 )
802  {
803  task->dep_arg_head = d;
804  task->dep_arg_tail = d;
805  }
806  else
807  {
808  task->dep_arg_tail->next_dep = d;
809  task->dep_arg_tail = d;
810  }
811 
812  task->n_dep_args++;
813  }
814 
815  // Add task to the read task in the object if not already there.
// Only the tail of the read list is compared against t.
816  if ( obj.base->n_read_tasks == 0 ||
817  obj.base->read_task_tail->task != t )
818  { // Anti-dependence potentially.
819  d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
820 
821  d->task = t;
822  d->next_dep = NULL;
823 
824  if ( obj.base->n_read_tasks == 0 )
825  {
826  obj.base->read_task_head = d;
827  obj.base->read_task_tail = d;
828  }
829  else
830  {
831  obj.base->read_task_tail->next_dep = d;
832  obj.base->read_task_tail = d;
833  }
834 
835  obj.base->n_read_tasks++;
836  }
837 
838  return;
839 }
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLASH_Dep * read_task_tail
Definition: FLA_type_defs.h:151
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_ready
Definition: FLA_type_defs.h:186
int n_read_tasks
Definition: FLA_type_defs.h:149
FLASH_Dep * dep_arg_tail
Definition: FLA_type_defs.h:233
Definition: FLA_type_defs.h:183
FLASH_Task * task
Definition: FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
int n_read_blocks
Definition: FLA_type_defs.h:145
FLASH_Dep * read_task_head
Definition: FLA_type_defs.h:150

◆ FLASH_Queue_push_output()

void FLASH_Queue_push_output ( FLA_Obj  obj,
FLASH_Task *  t 
)

References FLA_Obj_view::base, i, FLASH_Task_s::n_ready, FLA_Obj_struct::n_write_blocks, and FLA_Obj_struct::write_task.

// FLASH_Queue_push_output(): register task t as the new last writer of block
// obj.
//
// Steps, in order:
//   1. If obj has no recorded writer, decrement t->n_ready and give the block
//      the next write-block id (and the next read-block id if it has no
//      readers). If the recorded writer is another task, append a FLASH_Dep
//      node for t to that task's dep_arg list. If the recorded writer is t
//      itself, only decrement t->n_ready.
//   2. Walk obj's read-task list: each node whose task is not t is moved onto
//      that reader's dep_arg list with its task field retargeted to t, and
//      t->n_war_args is incremented; nodes whose task is t are freed.
//   3. Clear obj's read-task list and set obj.base->write_task to t.
849 {
850  int i;
851  FLASH_Task* task;
852  FLASH_Dep* d;
853  FLASH_Dep* next_dep;
854 
855  // Assign tasks to threads with data affinity.
856  if ( obj.base->write_task == NULL )
857  {
858  t->n_ready--;
859 
860  // Save index in which this output block is first encountered.
861  obj.base->n_write_blocks = flash_queue_n_write_blocks;
862 
863  // Number of blocks written if not written before.
864  flash_queue_n_write_blocks++;
865 
866  // Add to number of blocks read if not written or read before.
867  if ( obj.base->n_read_tasks == 0 )
868  {
869  // Identify each read block with an id for freeing.
870  obj.base->n_read_blocks = flash_queue_n_read_blocks;
871 
872  flash_queue_n_read_blocks++;
873  }
874  }
875  else
876  { // Flow dependence potentially.
877  // The last task to overwrite this block is not itself.
878  if ( obj.base->write_task != t )
879  {
880  // Create dependency from task that last wrote the block.
881  task = obj.base->write_task;
882 
883  d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
884 
885  d->task = t;
886  d->next_dep = NULL;
887 
888  if ( task->n_dep_args == 0 )
889  {
890  task->dep_arg_head = d;
891  task->dep_arg_tail = d;
892  }
893  else
894  {
895  task->dep_arg_tail->next_dep = d;
896  task->dep_arg_tail = d;
897  }
898 
899  task->n_dep_args++;
900  }
901  else
902  {
903  // No need to notify task twice for output block already seen.
904  t->n_ready--;
905  }
906  }
907 
908  // Clear read task for next set of reads and record the anti-dependence.
909  d = obj.base->read_task_head;
910 
911  for ( i = 0; i < obj.base->n_read_tasks; i++ )
912  {
// Save the reader and the successor first: the node d is either re-linked
// into another list or freed below.
913  task = d->task;
914  next_dep = d->next_dep;
915 
916  // If the last task to read is not the current task, add dependence.
917  if ( task != t )
918  {
// Reuse the existing node instead of allocating a new one.
919  d->task = t;
920  d->next_dep = NULL;
921 
922  if ( task->n_dep_args == 0 )
923  {
924  task->dep_arg_head = d;
925  task->dep_arg_tail = d;
926  }
927  else
928  {
929  task->dep_arg_tail->next_dep = d;
930  task->dep_arg_tail = d;
931  }
932 
933  task->n_dep_args++;
934 
935  t->n_war_args++;
936  }
937  else
938  {
939  FLA_free( d );
940  }
941 
942  d = next_dep;
943  }
944 
945  obj.base->n_read_tasks = 0;
946  obj.base->read_task_head = NULL;
947  obj.base->read_task_tail = NULL;
948 
949  // Record this task as the last to write to this block.
950  obj.base->write_task = t;
951 
952  return;
953 }
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLASH_Dep * read_task_tail
Definition: FLA_type_defs.h:151
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_ready
Definition: FLA_type_defs.h:186
int n_read_tasks
Definition: FLA_type_defs.h:149
FLASH_Dep * dep_arg_tail
Definition: FLA_type_defs.h:233
Definition: FLA_type_defs.h:183
FLASH_Task * task
Definition: FLA_type_defs.h:247
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
int n_war_args
Definition: FLA_type_defs.h:228
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
int n_write_blocks
Definition: FLA_type_defs.h:146
int i
Definition: bl1_axmyv2.c:145
int n_read_blocks
Definition: FLA_type_defs.h:145
FLASH_Dep * read_task_head
Definition: FLA_type_defs.h:150

◆ FLASH_Queue_reset()

void FLASH_Queue_reset ( void  )

References FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.

Referenced by FLASH_Queue_exec().

// FLASH_Queue_reset(): empty the global task queue `_tq` (task count, head
// and tail) and zero the read/write block counters. The tasks themselves are
// not freed here.
589 {
590  // Clear the other fields of the FLASH_Queue structure.
591  _tq.n_tasks = 0;
592  _tq.head = NULL;
593  _tq.tail = NULL;
594 
595  // Reset the number of blocks.
596  flash_queue_n_read_blocks = 0;
597  flash_queue_n_write_blocks = 0;
598 
599  return;
600 }
FLASH_Queue _tq
Definition: FLASH_Queue.c:27
FLASH_Task * tail
Definition: FLA_type_defs.h:180
FLASH_Task * head
Definition: FLA_type_defs.h:179
unsigned int n_tasks
Definition: FLA_type_defs.h:176

◆ FLASH_Queue_set_block_size()

void FLASH_Queue_set_block_size ( dim_t  size)

Referenced by FLASH_Obj_create_hierarchy().

// FLASH_Queue_set_block_size(): raise flash_queue_block_size to `size`.
// The stored value never decreases: a smaller or equal `size` is ignored.
467 {
468  // Only adjust the block size if the new block is larger.
469  if ( flash_queue_block_size < size )
470  flash_queue_block_size = size;
471 
472  return;
473 }

◆ FLASH_Queue_set_cache_line_size()

void FLASH_Queue_set_cache_line_size ( dim_t  size)
// FLASH_Queue_set_cache_line_size(): store `size` in
// flash_queue_cache_line_size without validation.
// NOTE(review): units are presumably bytes - confirm against callers.
517 {
518  flash_queue_cache_line_size = size;
519 
520  return;
521 }

◆ FLASH_Queue_set_cache_size()

void FLASH_Queue_set_cache_size ( dim_t  size)
// FLASH_Queue_set_cache_size(): store `size` in flash_queue_cache_size
// without validation.
// NOTE(review): units are presumably bytes - confirm against callers.
493 {
494  flash_queue_cache_size = size;
495 
496  return;
497 }

◆ FLASH_Queue_set_caching()

void FLASH_Queue_set_caching ( FLA_Bool  caching)

Referenced by FLASH_Queue_exec().

// FLASH_Queue_set_caching(): store the `caching` flag in flash_queue_caching.
349 {
350  flash_queue_caching = caching;
351 
352  return;
353 }

◆ FLASH_Queue_set_cores_per_cache()

void FLASH_Queue_set_cores_per_cache ( int  cores)
// FLASH_Queue_set_cores_per_cache(): store `cores` in
// flash_queue_cores_per_cache; the value is not range-checked.
541 {
542  flash_queue_cores_per_cache = cores;
543 
544  return;
545 }

◆ FLASH_Queue_set_cores_per_queue()

void FLASH_Queue_set_cores_per_queue ( int  cores)
// FLASH_Queue_set_cores_per_queue(): store `cores` in
// flash_queue_cores_per_queue; the value is not range-checked.
565 {
566  flash_queue_cores_per_queue = cores;
567 
568  return;
569 }

◆ FLASH_Queue_set_data_affinity()

void FLASH_Queue_set_data_affinity ( FLASH_Data_aff  data_affinity)

Referenced by FLASH_Queue_exec().

// FLASH_Queue_set_data_affinity(): store `data_affinity` in
// flash_queue_data_affinity.
397 {
398  flash_queue_data_affinity = data_affinity;
399 
400  return;
401 }

◆ FLASH_Queue_set_num_threads()

void FLASH_Queue_set_num_threads ( unsigned int  n_threads)

References FLA_Check_num_threads().

// FLASH_Queue_set_num_threads(): validate `n_threads` with
// FLA_Check_num_threads(), pass the result to FLA_Check_error_code(), then
// record the value in flash_queue_n_threads. Both preprocessor branches below
// contain only comments, so no model-specific call is made here.
193 {
194  FLA_Error e_val;
195 
196  // Verify that the number of threads is positive.
197  e_val = FLA_Check_num_threads( n_threads );
198  FLA_Check_error_code( e_val );
199 
200  // Keep track of the number of threads internally.
201  flash_queue_n_threads = n_threads;
202 
203 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
204 
205  // No additional action is necessary to set the number of OpenMP threads
206  // since setting the number of threads is handled at the parallel for loop
207  // with a num_threads() clause. This gives the user more flexibility since
208  // he can use the OMP_NUM_THREADS environment variable or the
209  // omp_set_num_threads() function to set the global number of OpenMP threads
210  // independently of the number of SuperMatrix threads.
211 
212 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
213 
214  // No additional action is necessary to set the number of pthreads
215  // since setting the number of threads is handled entirely on our end.
216 
217 #endif
218 
219  return;
220 }
FLA_Error FLA_Check_num_threads(unsigned int n_threads)
Definition: FLA_Check.c:884
int FLA_Error
Definition: FLA_type_defs.h:47

◆ FLASH_Queue_set_parallel_time()

void FLASH_Queue_set_parallel_time ( double  dtime)

Referenced by FLASH_Queue_exec().

// FLASH_Queue_set_parallel_time(): store `dtime` in flash_queue_parallel_time.
// NOTE(review): units are presumably seconds - confirm against FLASH_Queue_exec().
454 {
455  flash_queue_parallel_time = dtime;
456 
457  return;
458 }

◆ FLASH_Queue_set_sorting()

void FLASH_Queue_set_sorting ( FLA_Bool  sorting)
// FLASH_Queue_set_sorting(): store the `sorting` flag in flash_queue_sorting.
325 {
326  flash_queue_sorting = sorting;
327 
328  return;
329 }

◆ FLASH_Queue_set_verbose_output()

void FLASH_Queue_set_verbose_output ( FLASH_Verbose  verbose)
// FLASH_Queue_set_verbose_output(): store the `verbose` mode in
// flash_queue_verbose.
301 {
302  flash_queue_verbose = verbose;
303 
304  return;
305 }

◆ FLASH_Queue_set_work_stealing()

void FLASH_Queue_set_work_stealing ( FLA_Bool  work_stealing)

Referenced by FLASH_Queue_exec().

// FLASH_Queue_set_work_stealing(): store the `work_stealing` flag in
// flash_queue_work_stealing.
373 {
374  flash_queue_work_stealing = work_stealing;
375 
376  return;
377 }

◆ FLASH_Queue_stack_depth()

unsigned int FLASH_Queue_stack_depth ( void  )

Referenced by FLASH_Eig_gest(), FLASH_LU_incpiv(), FLASH_QR_UT_inc(), FLASH_Queue_disable_gpu(), and FLASH_Queue_enable_gpu().

// FLASH_Queue_stack_depth(): return the current value of flash_queue_stack.
// NOTE(review): presumably the nesting depth of FLASH_Queue_begin()/end() - confirm.
112 {
113  return flash_queue_stack;
114 }

◆ FLASH_Queue_update_block_gpu()

void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

// FLASH_Queue_update_block_gpu(): make block obj the most recently used entry
// in the block table kept for `thread`, and return its GPU buffer through
// buffer_gpu[0].
//
// `arg` is cast to FLASH_Queue_vars*. The table for a thread is the
// gpu_n_blocks consecutive entries starting at args->gpu[thread * gpu_n_blocks].
// The search stops at the last slot, so on a miss k indexes the last entry,
// which becomes the victim. Table manipulation happens while holding
// args->gpu_lock[thread] (when FLA_ENABLE_MULTITHREADING is defined); the
// calls to FLASH_Queue_read_gpu() for a dirty victim and
// FLASH_Queue_write_gpu() for a newly placed block happen after the lock is
// released.
// NOTE(review): k is int while gpu_n_blocks is dim_t (unsigned long), so
// `gpu_n_blocks - 1` would wrap if gpu_n_blocks were 0 - confirm it is
// always at least 1.
1704 {
1705  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1706  int j, k;
1707  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1708  FLA_Bool transfer = FALSE;
1709  FLA_Bool evict = FALSE;
1710  FLA_Obj_gpu evict_obj;
1711  FLA_Obj_gpu gpu_obj;
1712 
1713 #ifdef FLA_ENABLE_MULTITHREADING
1714  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1715 #endif
1716 
1717  // Locate the position of the block on GPU.
1718  for ( k = 0; k < gpu_n_blocks - 1; k++ )
1719  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1720  break;
1721 
1722  // Save the pointer to the data on the GPU.
1723  buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;
1724 
1725  // Save the victim block.
1726  evict_obj = args->gpu[thread * gpu_n_blocks + k];
1727 
1728  // The block is not already in the GPU.
1729  if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
1730  {
1731  // Save for data transfer outside of critical section.
1732  transfer = TRUE;
1733 
1734  // Save for eviction outside of critical section.
1735  if ( evict_obj.obj.base != NULL && !evict_obj.clean )
1736  {
1737  evict = TRUE;
1738  args->victim[thread] = evict_obj;
1739  }
1740 
1741  // Save the block in the data structure.
1742  args->gpu[thread * gpu_n_blocks + k].obj = obj;
1743 
1744  // Make sure the new block is clean.
1745  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1746  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1747  }
1748 
1749  // Use the block on the GPU that is a hit or LRU.
1750  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1751 
1752  // Shift all the previous tasks for LRU replacement.
1753  for ( j = k; j > 0; j-- )
1754  args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
1755 
1756  // Place the block on the cache as the most recently used.
1757  args->gpu[thread * gpu_n_blocks] = gpu_obj;
1758 
1759 #ifdef FLA_ENABLE_MULTITHREADING
1760  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1761 #endif
1762 
1763  // Evict and flush the LRU dirty block.
1764  if ( evict )
1765  {
1766  FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );
1767 
1768 #ifdef FLA_ENABLE_MULTITHREADING
1769  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1770 #endif
1771 
// The victim has been read back; clear the pending-victim record.
1772  args->victim[thread].obj.base = NULL;
1773 
1774 #ifdef FLA_ENABLE_MULTITHREADING
1775  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1776 #endif
1777  }
1778 
1779  // Move the block to the GPU.
1780  if ( transfer )
1781  FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1782 
1783  return;
1784 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Error FLASH_Queue_write_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:185
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
FLA_Obj_gpu * victim
Definition: FLASH_Queue_exec.c:107
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_update_cache()

void FLASH_Queue_update_cache ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

// FLASH_Queue_update_cache(): record every distinct operand block of task t
// in the cache t->cache by calling FLASH_Queue_update_cache_block().
//
// Returns immediately when t is NULL. Operands are visited from the last
// index to the first. An input operand is skipped when its base equals the
// base of any output operand or of an input operand with a lower index; an
// output operand is skipped when its base equals that of an output operand
// with a lower index. Inputs are passed with output = FALSE, outputs with
// output = TRUE. An operand whose elemtype is FLA_MATRIX is expanded into the
// m x n blocks it references. `arg` is forwarded unchanged.
853 {
854  int i, j;
855  FLA_Bool duplicate;
856  FLA_Obj obj;
857 
858  if ( t == NULL )
859  return;
860 
861  // Updating the input blocks.
862  for ( i = t->n_input_args - 1; i >= 0; i-- )
863  {
864  // Check for duplicate blocks.
865  duplicate = FALSE;
866 
867  for ( j = 0; j < t->n_output_args && !duplicate; j++ )
868  {
869  if ( t->input_arg[i].base == t->output_arg[j].base )
870  duplicate = TRUE;
871  }
872 
873  for ( j = 0; j < i && !duplicate; j++ )
874  {
875  if ( t->input_arg[i].base == t->input_arg[j].base )
876  duplicate = TRUE;
877  }
878 
879  // If the input block has not been processed before.
880  if ( !duplicate )
881  {
882  obj = t->input_arg[i];
883 
884  // Macroblock is used.
885  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
886  {
887  dim_t jj, kk;
888  dim_t m = FLA_Obj_length( obj );
889  dim_t n = FLA_Obj_width( obj );
890  dim_t cs = FLA_Obj_col_stride( obj );
891  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
892 
893  // Update the cache for each input block in macroblock.
894  for ( jj = 0; jj < n; jj++ )
895  for ( kk = 0; kk < m; kk++ )
896  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
897  t->cache, FALSE, arg );
898  }
899  else // Regular block.
900  {
901  FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
902  }
903  }
904  }
905 
906  // Updating the output blocks.
907  for ( i = t->n_output_args - 1; i >= 0; i-- )
908  {
909  // Check for duplicate blocks.
910  duplicate = FALSE;
911 
912  for ( j = 0; j < i && !duplicate; j++ )
913  {
914  if ( t->output_arg[i].base == t->output_arg[j].base )
915  duplicate = TRUE;
916  }
917 
918  // If the output block has not been processed before.
919  if ( !duplicate )
920  {
921  obj = t->output_arg[i];
922 
923  // Macroblock is used.
924  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
925  {
926  dim_t jj, kk;
927  dim_t m = FLA_Obj_length( obj );
928  dim_t n = FLA_Obj_width( obj );
929  dim_t cs = FLA_Obj_col_stride( obj );
930  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
931 
932  // Update the cache for each output block in macroblock.
933  for ( jj = 0; jj < n; jj++ )
934  for ( kk = 0; kk < m; kk++ )
935  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
936  t->cache, TRUE, arg );
937  }
938  else // Regular block.
939  {
940  FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
941  }
942  }
943  }
944 
945  return;
946 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
void FLASH_Queue_update_cache_block(FLA_Obj obj, int cache, FLA_Bool output, void *arg)
Definition: FLASH_Queue_exec.c:949
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51

◆ FLASH_Queue_update_cache_block()

void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), i, FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

// FLASH_Queue_update_cache_block(): move block obj to the front of the
// software cache table numbered `cache` and, when `output` is true, remove it
// from every other cache table.
//
// `arg` is cast to FLASH_Queue_vars*. Cache c occupies the `size` consecutive
// FLA_Obj entries starting at args->cache[c * size]; entry 0 is the most
// recently used. The lookup in the target cache stops at the last slot, so on
// a miss the last entry is the one overwritten by the shift. Each table is
// modified while holding its own args->cac_lock[] entry (when
// FLA_ENABLE_MULTITHREADING is defined); only one lock is held at a time.
958 {
959  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
960  int i, j, k;
961  int n_caches = args->n_caches;
962  int size = args->size;
963 
964 #ifdef FLA_ENABLE_MULTITHREADING
965  FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
966 #endif
967 
968  // Locate the position of the block in the cache.
969  for ( k = 0; k < size - 1; k++ )
970  {
971  if ( obj.base == args->cache[cache * size + k].base )
972  break;
973  }
974 
975  // Shift all the previous tasks for LRU replacement.
976  for ( j = k; j > 0; j-- )
977  args->cache[cache * size + j] = args->cache[cache * size + j - 1];
978 
979  // Place the block on the cache as the most recently used.
980  args->cache[cache * size] = obj;
981 
982 #ifdef FLA_ENABLE_MULTITHREADING
983  FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
984 #endif
985 
986  // Write invalidate if updating with output block.
987  if ( output )
988  {
989  for ( i = 0; i < n_caches; i++ )
990  {
991  if ( i != cache )
992  {
993 #ifdef FLA_ENABLE_MULTITHREADING
994  FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
995 #endif
996  // Locate the position of the block in the cache.
// Unlike the lookup above, this one scans all `size` slots; k == size means not found.
997  for ( k = 0; k < size; k++ )
998  {
999  if ( obj.base == args->cache[i * size + k].base )
1000  break;
1001  }
1002 
1003  // The block is owned by other thread.
1004  if ( k < size )
1005  {
1006  // Shift all the blocks for the invalidated block.
1007  for ( j = k; j < size - 1; j++ )
1008  args->cache[i * size + j] = args->cache[i * size + j + 1];
1009 
1010  // Invalidate the block.
1011  args->cache[i * size + size - 1].base = NULL;
1012  }
1013 #ifdef FLA_ENABLE_MULTITHREADING
1014  FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
1015 #endif
1016  }
1017  }
1018  }
1019 
1020  return;
1021 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Lock * cac_lock
Definition: FLASH_Queue_exec.c:74
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_caches
Definition: FLASH_Queue_exec.c:80
Definition: FLASH_Queue_exec.c:54
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_update_gpu()

void FLASH_Queue_update_gpu ( FLASH_Task *  t,
void **  input_arg,
void **  output_arg,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

// FLASH_Queue_update_gpu(): bring every distinct operand block of task t onto
// the GPU associated with t->thread and fill input_arg[] / output_arg[] with
// the corresponding GPU buffer pointers.
//
// Parameters:
//   t          - task whose t->input_arg[] / t->output_arg[] blocks are used.
//   input_arg  - out: one GPU buffer pointer per input operand of t.
//   output_arg - out: one GPU buffer pointer per output operand of t.
//   arg        - forwarded unchanged to FLASH_Queue_update_block_gpu() and
//                FLASH_Queue_invalidate_block_gpu().
//
// First pass: operands are visited from the last index to the first. A
// non-duplicate operand is passed to FLASH_Queue_update_block_gpu(); a
// duplicate (same base as any output operand or as a lower-indexed operand of
// the same kind) gets a NULL pointer for now. Each non-duplicate output block
// is additionally invalidated on every other thread's GPU.
// Second pass: each NULL pointer is replaced by the pointer already obtained
// for the operand with the same base.
1599 {
1600  int i, j, k;
1601  int thread = t->thread;
1602  int n_threads = FLASH_Queue_get_num_threads();
1603  FLA_Bool duplicate;
1604 
1605  // None of the arguments can be macroblocks yet.
1606  // Complicating factor is copying macroblock to contiguous memory on GPU.
1607 
1608  // Bring the input arguments to the GPU.
1609  for ( i = t->n_input_args - 1; i >= 0; i-- )
1610  {
1611  // Check for duplicate blocks.
1612  duplicate = FALSE;
1613 
1614  for ( j = 0; j < t->n_output_args && !duplicate; j++ )
1615  {
1616  if ( t->input_arg[i].base == t->output_arg[j].base )
1617  duplicate = TRUE;
1618  }
1619 
1620  for ( j = 0; j < i && !duplicate; j++ )
1621  {
1622  if ( t->input_arg[i].base == t->input_arg[j].base )
1623  duplicate = TRUE;
1624  }
1625 
1626  // If the input block has not been processed before.
1627  if ( !duplicate )
1628  {
1629  FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
1630  }
1631  else
1632  {
1633  input_arg[i] = NULL;
1634  }
1635  }
1636 
1637  // Bring the output arguments to the GPU.
1638  for ( i = t->n_output_args - 1; i >= 0; i-- )
1639  {
1640  // Check for duplicate blocks.
1641  duplicate = FALSE;
1642 
1643  for ( j = 0; j < i && !duplicate; j++ )
1644  {
1645  if ( t->output_arg[i].base == t->output_arg[j].base )
1646  duplicate = TRUE;
1647  }
1648 
1649  // If the output block has not been processed before.
1650  if ( !duplicate )
1651  {
1652  FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );
1653 
1654  // Invalidate output blocks on all other GPUs.
1655  for ( k = 0; k < n_threads; k++ )
1656  if ( k != thread )
1657  FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
1658  }
1659  else
1660  {
1661  output_arg[i] = NULL;
1662  }
1663  }
1664 
1665  // Resolve duplicate inputs: reuse the pointer of the matching operand.
1666  for ( i = t->n_input_args - 1; i >= 0; i-- )
1667  {
1668  for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
1669  {
1670  if ( t->input_arg[i].base == t->output_arg[j].base )
1671  input_arg[i] = output_arg[j];
1672  }
1673 
1674  for ( j = 0; j < i && input_arg[i] == NULL; j++ )
1675  {
1676  if ( t->input_arg[i].base == t->input_arg[j].base )
1677  input_arg[i] = input_arg[j];
1678  }
1679  }
1680 
1681  // Resolve duplicate outputs: reuse the pointer of the matching operand.
1682  for ( i = t->n_output_args - 1; i >= 0; i-- )
1683  {
1684  for ( j = 0; j < i && output_arg[i] == NULL; j++ )
1685  {
1686  if ( t->output_arg[i].base == t->output_arg[j].base )
1687  output_arg[i] = output_arg[j];
1688  }
1689  }
1690 
1691  return;
1692 }
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1844
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
void FLASH_Queue_update_block_gpu(FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1695
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223

◆ FLASH_Queue_verbose_output()

void FLASH_Queue_verbose_output ( void  )

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), i, FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec().

1788 {
1789  int i, j, k;
1790  int n_threads = FLASH_Queue_get_num_threads();
1791  int n_tasks = FLASH_Queue_get_num_tasks();
1793  FLASH_Task* t;
1794  FLASH_Dep* d;
1795 
1796  // Grab the head of the task queue.
1798 
1799  if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
1800  {
1801  // Iterate over linked list of tasks.
1802  for ( i = 0; i < n_tasks; i++ )
1803  {
1804  printf( "%d\t%s\t", t->order, t->name );
1805 
1806  for ( j = 0; j < t->n_output_args; j++ )
1807  printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1808  t->output_arg[j].base->m_index,
1809  t->output_arg[j].base->n_index );
1810 
1811  printf( ":= " );
1812 
1813  for ( j = 0; j < t->n_output_args; j++ )
1814  printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1815  t->output_arg[j].base->m_index,
1816  t->output_arg[j].base->n_index );
1817 
1818  for ( j = 0; j < t->n_input_args; j++ )
1819  printf( "%lu[%lu,%lu] ", t->input_arg[j].base->id,
1820  t->input_arg[j].base->m_index,
1821  t->input_arg[j].base->n_index );
1822 
1823  printf( "\n" );
1824 
1825  // Go to the next task.
1826  t = t->next_task;
1827  }
1828 
1829  printf( "\n" );
1830  }
1831  else
1832  {
1833  printf( "digraph SuperMatrix {\n" );
1834 
1835  if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
1836  {
1837  // Iterate over linked list of tasks.
1838  for ( i = 0; i < n_tasks; i++ )
1839  {
1840  printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);
1841 
1842  d = t->dep_arg_head;
1843  for ( j = 0; j < t->n_dep_args; j++ )
1844  {
1845  printf( "%d;", d->task->order );
1846  d = d->next_dep;
1847  }
1848 
1849  printf( "};\n" );
1850 
1851  // Go to the next task.
1852  t = t->next_task;
1853  }
1854  }
1855  else
1856  {
1857  // Iterate over all the threads.
1858  for ( k = 0; k < n_threads; k++ )
1859  {
1860  printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );
1861 
1862  // Iterate over linked list of tasks.
1863  for ( i = 0; i < n_tasks; i++ )
1864  {
1865  if ( t->queue == k )
1866  printf( "%d [label=\"%s\"];\n", t->order, t->name );
1867 
1868  // Go to the next task.
1869  t = t->next_task;
1870  }
1871 
1872  printf( "}\n" );
1873 
1874  // Grab the head of the task queue.
1876  }
1877 
1878  // Iterate over linked list of tasks.
1879  for ( i = 0; i < n_tasks; i++ )
1880  {
1881  printf( "%d -> {", t->order );
1882 
1883  d = t->dep_arg_head;
1884  for ( j = 0; j < t->n_dep_args; j++ )
1885  {
1886  printf( "%d;", d->task->order );
1887  d = d->next_dep;
1888  }
1889 
1890  printf( "};\n" );
1891 
1892  // Go to the next task.
1893  t = t->next_task;
1894  }
1895  }
1896 
1897  printf( "}\n\n" );
1898  }
1899 
1900  return;
1901 }
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLASH_Task * next_task
Definition: FLA_type_defs.h:237
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition: FLASH_Queue.c:308
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
dim_t n_index
Definition: FLA_type_defs.h:135
unsigned long id
Definition: FLA_type_defs.h:133
Definition: FLA_type_defs.h:183
int FLASH_Verbose
Definition: FLA_type_defs.h:113
FLASH_Task * task
Definition: FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
int n_input_args
Definition: FLA_type_defs.h:217
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition: FLASH_Queue.c:603
int n_output_args
Definition: FLA_type_defs.h:221
int queue
Definition: FLA_type_defs.h:190
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
int order
Definition: FLA_type_defs.h:189
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int i
Definition: bl1_axmyv2.c:145
FLASH_Data_aff FLASH_Queue_get_data_affinity(void)
Definition: FLASH_Queue.c:404
dim_t m_index
Definition: FLA_type_defs.h:134
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
char * name
Definition: FLA_type_defs.h:203

◆ FLASH_Queue_wait_dequeue()

FLASH_Task * FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void *  arg 
)

References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().

2962 {
2963  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2964  FLASH_Task* t = NULL;
2965 
2966  if ( args->n_wait[0] > 0 )
2967  {
2968  // Grab the head of the queue.
2969  t = args->task_queue[args->wait_queue[args->pc[0]]];
2970 
2971  // Decrement number of tasks on waiting queue.
2972  args->n_wait[0]--;
2973 
2974  // Increment the program counter.
2975  args->pc[0]++;
2976  }
2977 
2978  return t;
2979 }
int pc
Definition: FLASH_Queue_exec.c:96
FLASH_Task ** task_queue
Definition: FLASH_Queue_exec.c:2739
Definition: FLA_type_defs.h:183
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int * n_wait
Definition: FLASH_Queue_exec.c:2748

◆ FLASH_Queue_wait_dequeue_block()

FLASH_Task* FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, i, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

785 {
786  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
787  int i, j, k;
788  int size = args->size;
789  int n_tasks = args->wait_queue[queue].n_tasks;
790  FLA_Bool enabled = FALSE;
791  FLASH_Task* t;
792  FLA_Obj obj;
793  FLA_Obj mem;
794 
795 #ifdef FLA_ENABLE_GPU
796  enabled = FLASH_Queue_get_enabled_gpu();
797 
798  // If using GPUs, then only check GPU and not the cache.
799  if ( enabled )
801 #endif
802 
803  t = args->wait_queue[queue].head;
804 
805  // Check if any of the output blocks are in the cache.
806  for ( i = 0; i < n_tasks; i++ )
807  {
808  for ( j = 0; j < size; j++ )
809  {
810  // Initialize the memory just in case.
811  mem.base = NULL;
812 
813  // Determine if using GPU or not.
814  if ( enabled )
815  {
816 #ifdef FLA_ENABLE_GPU
817  mem = args->gpu[cache * size + j].obj;
818 #endif
819  }
820  else
821  {
822  mem = args->cache[cache * size + j];
823  }
824 
825  for ( k = 0; k < t->n_output_args; k++ )
826  {
827  obj = t->output_arg[k];
828 
829  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
830  obj = *FLASH_OBJ_PTR_AT( obj );
831 
832  // Return the task if its output block is in cache.
833  if ( mem.base == obj.base )
834  {
835  t->hit = TRUE;
836  return t;
837  }
838  }
839  }
840  t = t->next_wait;
841  }
842 
843  return args->wait_queue[queue].head;
844 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
Definition: FLA_type_defs.h:183
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
Definition: FLA_type_defs.h:158
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLASH_Task * head
Definition: FLA_type_defs.h:179
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
int i
Definition: bl1_axmyv2.c:145
unsigned int n_tasks
Definition: FLA_type_defs.h:176
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_wait_enqueue()

void FLASH_Queue_wait_enqueue ( FLASH_Task *  t,
void *  arg 
)

References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, i, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().

2930 {
2931  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2932  int i = args->n_wait[0] + args->pc[0];
2933 
2934  // Insertion sort of tasks in waiting queue.
2935  if ( FLASH_Queue_get_sorting() )
2936  {
2937  for ( ; i > args->pc[0]; i-- )
2938  {
2939  if ( args->task_queue[args->wait_queue[i-1]]->height >
2940  args->task_queue[t->order]->height )
2941  break;
2942 
2943  args->wait_queue[i] = args->wait_queue[i-1];
2944  }
2945  }
2946 
2947  args->wait_queue[i] = t->order;
2948 
2949  // Increment number of tasks on waiting queue.
2950  args->n_wait[0]++;
2951 
2952  return;
2953 }
int height
Definition: FLA_type_defs.h:191
FLA_Bool FLASH_Queue_get_sorting(void)
Definition: FLASH_Queue.c:332
int pc
Definition: FLASH_Queue_exec.c:96
FLASH_Task ** task_queue
Definition: FLASH_Queue_exec.c:2739
int order
Definition: FLA_type_defs.h:189
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int i
Definition: bl1_axmyv2.c:145
int * n_wait
Definition: FLASH_Queue_exec.c:2748

◆ FLASH_Queue_work_stealing()

FLASH_Task* FLASH_Queue_work_stealing ( int  queue,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

1161 {
1162  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1163  int q;
1164  int n_queues = args->n_queues;
1165  FLASH_Task* t = NULL;
1166 
1167  // Do not perform work stealing if there is only one queue.
1168  if ( n_queues == 1 )
1169  return t;
1170 
1171  // Find a random queue not equal to the current queue.
1172  do
1173  {
1174 #ifdef FLA_ENABLE_WINDOWS_BUILD
1175  rand_s( &q );
1176  q = q % n_queues;
1177 #else
1178 #ifdef FLA_ENABLE_TIDSP
1179  q = rand() % n_queues;
1180 #else
1181  q = lrand48() % n_queues;
1182 #endif
1183 #endif
1184  }
1185  while ( q == queue );
1186 
1187 #ifdef FLA_ENABLE_MULTITHREADING
1188  FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189 #endif
1190 
1191  // If there are tasks that this thread can steal.
1192  if ( args->wait_queue[q].n_tasks > 0 )
1193  {
1194  // Dequeue the last task.
1195  t = args->wait_queue[q].tail;
1196 
1197  if ( args->wait_queue[q].n_tasks == 1 )
1198  {
1199  // Clear the queue of its only task.
1200  args->wait_queue[q].head = NULL;
1201  args->wait_queue[q].tail = NULL;
1202  }
1203  else
1204  {
1205  // Adjust pointers in waiting queue.
1206  args->wait_queue[q].tail = t->prev_wait;
1207  args->wait_queue[q].tail->next_wait = NULL;
1208  }
1209 
1210  // Reset waiting queue data about the stolen task.
1211  t->queue = queue;
1212  t->prev_wait = NULL;
1213  t->next_wait = NULL;
1214 
1215  args->wait_queue[q].n_tasks--;
1216  }
1217 
1218 #ifdef FLA_ENABLE_MULTITHREADING
1219  FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220 #endif
1221 
1222  return t;
1223 }
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLASH_Task * prev_wait
Definition: FLA_type_defs.h:240
Definition: FLA_type_defs.h:183
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
FLASH_Task * tail
Definition: FLA_type_defs.h:180
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
FLASH_Task * head
Definition: FLA_type_defs.h:179
int queue
Definition: FLA_type_defs.h:190
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
unsigned int n_tasks
Definition: FLA_type_defs.h:176
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Task_alloc()

FLASH_Task* FLASH_Task_alloc ( void *  func,
void *  cntl,
char *  name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args 
)

References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_push().

969 {
970  FLASH_Task* t;
971 
972  // Allocate space for the task structure t.
973  t = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );
974 
975  // Allocate space for the task's integer arguments.
976  t->int_arg = (int *) FLA_malloc( n_int_args * sizeof(int) );
977 
978  // Allocate space for the task's FLA_Obj arguments.
979  t->fla_arg = (FLA_Obj *) FLA_malloc( n_fla_args * sizeof(FLA_Obj) );
980 
981  // Allocate space for the task's input FLA_Obj arguments.
982  t->input_arg = (FLA_Obj *) FLA_malloc( n_input_args * sizeof(FLA_Obj) );
983 
984  // Allocate space for the task's output FLA_Obj arguments.
985  t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );
986 
987  // Initialize other fields of the structure.
988  t->n_ready = 0;
989  t->order = 0;
990  t->queue = 0;
991  t->height = 0;
992  t->thread = 0;
993  t->cache = 0;
994  t->hit = FALSE;
995 
996  t->func = func;
997  t->cntl = cntl;
998  t->name = name;
999  t->enabled_gpu = enabled_gpu;
1000  t->n_int_args = n_int_args;
1001  t->n_fla_args = n_fla_args;
1002  t->n_input_args = n_input_args;
1003  t->n_output_args = n_output_args;
1004 
1005  t->n_macro_args = 0;
1006  t->n_war_args = 0;
1007  t->n_dep_args = 0;
1008  t->dep_arg_head = NULL;
1009  t->dep_arg_tail = NULL;
1010  t->prev_task = NULL;
1011  t->next_task = NULL;
1012  t->prev_wait = NULL;
1013  t->next_wait = NULL;
1014 
1015  // Return a pointer to the initialized structure.
1016  return t;
1017 }
int height
Definition: FLA_type_defs.h:191
int * int_arg
Definition: FLA_type_defs.h:210
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLASH_Task * next_task
Definition: FLA_type_defs.h:237
FLASH_Task * prev_wait
Definition: FLA_type_defs.h:240
int n_fla_args
Definition: FLA_type_defs.h:213
int n_ready
Definition: FLA_type_defs.h:186
void * cntl
Definition: FLA_type_defs.h:200
FLASH_Dep * dep_arg_tail
Definition: FLA_type_defs.h:233
Definition: FLA_type_defs.h:183
FLA_Bool enabled_gpu
Definition: FLA_type_defs.h:206
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
Definition: FLA_type_defs.h:158
int n_int_args
Definition: FLA_type_defs.h:209
int n_dep_args
Definition: FLA_type_defs.h:231
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
int n_war_args
Definition: FLA_type_defs.h:228
int queue
Definition: FLA_type_defs.h:190
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
int n_macro_args
Definition: FLA_type_defs.h:225
int order
Definition: FLA_type_defs.h:189
FLA_Bool hit
Definition: FLA_type_defs.h:194
FLASH_Task * prev_task
Definition: FLA_type_defs.h:236
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int cache
Definition: FLA_type_defs.h:193
int thread
Definition: FLA_type_defs.h:192
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
void * func
Definition: FLA_type_defs.h:197
char * name
Definition: FLA_type_defs.h:203

◆ FLASH_Task_free()

void FLASH_Task_free ( FLASH_Task *  t )

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().

1026 {
1027  int i, j, k;
1028  FLA_Obj obj;
1029  FLASH_Dep* d;
1030  FLASH_Dep* next_dep;
1031 
1032  // Clearing the last write task in each output block.
1033  for ( i = 0; i < t->n_output_args; i++ )
1034  {
1035  obj = t->output_arg[i];
1036 
1037  // Macroblock is used.
1038  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1039  {
1040  dim_t jj, kk;
1041  dim_t m = FLA_Obj_length( obj );
1042  dim_t n = FLA_Obj_width( obj );
1043  dim_t cs = FLA_Obj_col_stride( obj );
1044  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1045 
1046  // Clear each block in macroblock.
1047  for ( jj = 0; jj < n; jj++ )
1048  for ( kk = 0; kk < m; kk++ )
1049  ( buf + jj * cs + kk )->base->write_task = NULL;
1050  }
1051  else // Clear regular block.
1052  {
1053  obj.base->write_task = NULL;
1054  }
1055  }
1056 
1057  // Cleaning the last read tasks in each input block.
1058  for ( i = 0; i < t->n_input_args; i++ )
1059  {
1060  obj = t->input_arg[i];
1061 
1062  // Macroblock is used.
1063  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1064  {
1065  dim_t jj, kk;
1066  dim_t m = FLA_Obj_length( obj );
1067  dim_t n = FLA_Obj_width( obj );
1068  dim_t cs = FLA_Obj_col_stride( obj );
1069  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1070 
1071  // Clear each block in macroblock.
1072  for ( jj = 0; jj < n; jj++ )
1073  {
1074  for ( kk = 0; kk < m; kk++ )
1075  {
1076  obj = *( buf + jj * cs + kk );
1077 
1078  k = obj.base->n_read_tasks;
1079  d = obj.base->read_task_head;
1080 
1081  obj.base->n_read_tasks = 0;
1082  obj.base->read_task_head = NULL;
1083  obj.base->read_task_tail = NULL;
1084 
1085  for ( j = 0; j < k; j++ )
1086  {
1087  next_dep = d->next_dep;
1088  FLA_free( d );
1089  d = next_dep;
1090  }
1091  }
1092  }
1093  }
1094  else // Regular block.
1095  {
1096  k = obj.base->n_read_tasks;
1097  d = obj.base->read_task_head;
1098 
1099  obj.base->n_read_tasks = 0;
1100  obj.base->read_task_head = NULL;
1101  obj.base->read_task_tail = NULL;
1102 
1103  for ( j = 0; j < k; j++ )
1104  {
1105  next_dep = d->next_dep;
1106  FLA_free( d );
1107  d = next_dep;
1108  }
1109  }
1110  }
1111 
1112  // Free the dep_arg field of t.
1113  d = t->dep_arg_head;
1114 
1115  for ( i = 0; i < t->n_dep_args; i++ )
1116  {
1117  next_dep = d->next_dep;
1118  FLA_free( d );
1119  d = next_dep;
1120  }
1121 
1122  // Free the int_arg field of t.
1123  FLA_free( t->int_arg );
1124 
1125  // Free the fla_arg field of t.
1126  FLA_free( t->fla_arg );
1127 
1128  // Free the input_arg field of t.
1129  FLA_free( t->input_arg );
1130 
1131  // Free the output_arg field of t.
1132  FLA_free( t->output_arg );
1133 
1134  // Finally, free the struct itself.
1135  FLA_free( t );
1136 
1137  return;
1138 }
int * int_arg
Definition: FLA_type_defs.h:210
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLASH_Dep * read_task_tail
Definition: FLA_type_defs.h:151
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_read_tasks
Definition: FLA_type_defs.h:149
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
FLASH_Dep * read_task_head
Definition: FLA_type_defs.h:150

◆ FLASH_Task_free_parallel()

void FLASH_Task_free_parallel ( FLASH_Task *  t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

2452 {
2453  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2454  int i, j, k;
2455  int thread;
2456  int n_threads = FLASH_Queue_get_num_threads();
2457  FLASH_Dep* d;
2458  FLASH_Dep* next_dep;
2459  FLA_Obj obj;
2460 
2461  // Clearing the last write task in each output block.
2462  for ( i = 0; i < t->n_output_args; i++ )
2463  {
2464  obj = t->output_arg[i];
2465 
2466  // Macroblock is used.
2467  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468  {
2469  dim_t jj, kk;
2470  dim_t m = FLA_Obj_length( obj );
2471  dim_t n = FLA_Obj_width( obj );
2472  dim_t cs = FLA_Obj_col_stride( obj );
2473  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2474 
2475  // Clear each block in macroblock.
2476  for ( jj = 0; jj < n; jj++ )
2477  for ( kk = 0; kk < m; kk++ )
2478  ( buf + jj * cs + kk )->base->write_task = NULL;
2479  }
2480  else // Clear regular block.
2481  {
2482  obj.base->write_task = NULL;
2483  }
2484  }
2485 
2486  // Cleaning the last read tasks in each input block.
2487  for ( i = 0; i < t->n_input_args; i++ )
2488  {
2489  obj = t->input_arg[i];
2490 
2491  // Macroblock is used.
2492  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493  {
2494  dim_t jj, kk;
2495  dim_t m = FLA_Obj_length( obj );
2496  dim_t n = FLA_Obj_width( obj );
2497  dim_t cs = FLA_Obj_col_stride( obj );
2498  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2499 
2500  // Clear each block in macroblock.
2501  for ( jj = 0; jj < n; jj++ )
2502  {
2503  for ( kk = 0; kk < m; kk++ )
2504  {
2505  obj = *( buf + jj * cs + kk );
2506 
2507  thread = obj.base->n_read_blocks % n_threads;
2508 
2509  FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510 
2511  k = obj.base->n_read_tasks;
2512  d = obj.base->read_task_head;
2513 
2514  obj.base->n_read_tasks = 0;
2515  obj.base->read_task_head = NULL;
2516  obj.base->read_task_tail = NULL;
2517 
2518  FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519 
2520  for ( j = 0; j < k; j++ )
2521  {
2522  next_dep = d->next_dep;
2523  FLA_free( d );
2524  d = next_dep;
2525  }
2526  }
2527  }
2528  }
2529  else // Regular block.
2530  {
2531  thread = obj.base->n_read_blocks % n_threads;
2532 
2533  FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534 
2535  k = obj.base->n_read_tasks;
2536  d = obj.base->read_task_head;
2537 
2538  obj.base->n_read_tasks = 0;
2539  obj.base->read_task_head = NULL;
2540  obj.base->read_task_tail = NULL;
2541 
2542  FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543 
2544  for ( j = 0; j < k; j++ )
2545  {
2546  next_dep = d->next_dep;
2547  FLA_free( d );
2548  d = next_dep;
2549  }
2550  }
2551  }
2552 
2553  // Free the dep_arg field of t.
2554  d = t->dep_arg_head;
2555 
2556  for ( i = 0; i < t->n_dep_args; i++ )
2557  {
2558  next_dep = d->next_dep;
2559  FLA_free( d );
2560  d = next_dep;
2561  }
2562 
2563  // Free the int_arg field of t.
2564  FLA_free( t->int_arg );
2565 
2566  // Free the fla_arg field of t.
2567  FLA_free( t->fla_arg );
2568 
2569  // Free the input_arg field of t.
2570  FLA_free( t->input_arg );
2571 
2572  // Free the output_arg field of t.
2573  FLA_free( t->output_arg );
2574 
2575  // Finally, free the struct itself.
2576  FLA_free( t );
2577 
2578  return;
2579 }
int * int_arg
Definition: FLA_type_defs.h:210
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLASH_Dep * read_task_tail
Definition: FLA_type_defs.h:151
FLA_Lock * war_lock
Definition: FLASH_Queue_exec.c:70
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_read_tasks
Definition: FLA_type_defs.h:149
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
int n_input_args
Definition: FLA_type_defs.h:217
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
Definition: FLASH_Queue_exec.c:54
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int n_read_blocks
Definition: FLA_type_defs.h:145
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
FLASH_Dep * read_task_head
Definition: FLA_type_defs.h:150

◆ FLASH_Task_update_binding()

FLASH_Task* FLASH_Task_update_binding ( FLASH_Task *  t,
FLASH_Task *  r,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

2401 {
2402  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2403  int queue;
2404 
2405  if ( r == NULL )
2406  {
2407  // There are no tasks on waiting queue, so bind the first task.
2408  r = t;
2409  r->hit = TRUE;
2410  }
2411  else
2412  {
2413  // Swap the binded task for the new ready task.
2414  if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
2415  {
2416  queue = r->queue;
2417  r->hit = FALSE;
2418 
2419  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2420 
2421  // Place swapped task back onto waiting queue.
2422  FLASH_Queue_wait_enqueue( r, arg );
2423 
2424  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2425 
2426  // Bind the new ready task.
2427  r = t;
2428  r->hit = TRUE;
2429  }
2430  else // Keep the binded task and enqueue new ready task.
2431  {
2432  queue = t->queue;
2433 
2434  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2435 
2436  FLASH_Queue_wait_enqueue( t, arg );
2437 
2438  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2439  }
2440  }
2441 
2442  return r;
2443 }
int height
Definition: FLA_type_defs.h:191
FLA_Bool FLASH_Queue_get_sorting(void)
Definition: FLASH_Queue.c:332
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int queue
Definition: FLA_type_defs.h:190
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626

◆ FLASH_Task_update_dependencies()

FLASH_Task * FLASH_Task_update_dependencies ( FLASH_Task *  t,
void *  arg 
)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), i, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().

3050 {
3051  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
3052  int i;
3053  int n_threads = FLASH_Queue_get_num_threads();
3054  int thread;
3055  FLA_Bool available;
3056  FLASH_Task* task;
3057  FLASH_Task* r = NULL;
3058  FLASH_Dep* d = t->dep_arg_head;
3059 
3060  // Check each dependent task.
3061  for ( i = 0; i < t->n_dep_args; i++ )
3062  {
3063  task = d->task;
3064 
3065  // Use the remaining locks except for the first one.
3066  thread = ( n_threads > 1 ? task->order % ( n_threads - 1 ) + 1 : 0 );
3067 
3068  RCCE_acquire_lock( thread );
3069 
3070  args->n_ready[task->order]--;
3071  available = ( args->n_ready[task->order] == 0 );
3072 
3073  RCCE_release_lock( thread );
3074 
3075  // Place newly ready tasks on waiting queue.
3076  if ( available )
3077  {
3078  RCCE_acquire_lock( 0 );
3079 
3080  FLASH_Queue_wait_enqueue( task, arg );
3081 
3082  RCCE_release_lock( 0 );
3083  }
3084 
3085  // Go to the next dep.
3086  d = d->next_dep;
3087  }
3088 
3089  return r;
3090 }
int RCCE_release_lock(int)
int * n_ready
Definition: FLASH_Queue_exec.c:2742
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
Definition: FLA_type_defs.h:244
Definition: FLA_type_defs.h:183
FLASH_Task * task
Definition: FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
int order
Definition: FLA_type_defs.h:189
Definition: FLASH_Queue_exec.c:54
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
int FLA_Bool
Definition: FLA_type_defs.h:46
int i
Definition: bl1_axmyv2.c:145
int RCCE_acquire_lock(int)
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223