libflame  revision_anchor
Data Structures | Typedefs | Functions
FLASH_Queue_exec.c File Reference

(r)

Data Structures

struct  FLA_Obj_gpu_struct
 
struct  FLASH_Queue_variables
 
struct  FLASH_Queue_variables
 

Typedefs

typedef struct FLA_Obj_gpu_struct FLA_Obj_gpu
 
typedef struct FLASH_Queue_variables FLASH_Queue_vars
 

Functions

void FLASH_Queue_exec (void)
 
void FLASH_Queue_init_tasks (void *arg)
 
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
 
FLASH_TaskFLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
 
FLASH_TaskFLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
 
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
 
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
 
void FLASH_Queue_prefetch (int cache, void *arg)
 
void FLASH_Queue_prefetch_block (FLA_Obj obj)
 
FLASH_TaskFLASH_Queue_work_stealing (int queue, void *arg)
 
void FLASH_Queue_create_gpu (int thread, void *arg)
 
void FLASH_Queue_destroy_gpu (int thread, void *arg)
 
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
 
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
 
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
 
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
 
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
 
void FLASH_Queue_flush_gpu (int thread, void *arg)
 
void FLASH_Queue_exec_parallel (void *arg)
 
void * FLASH_Queue_exec_parallel_function (void *arg)
 
FLASH_TaskFLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
 
FLASH_TaskFLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
 
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
 
void FLASH_Queue_exec_simulation (void *arg)
 
int RCCE_acquire_lock (int)
 
int RCCE_release_lock (int)
 
double RCCE_wtime (void)
 
int RCCE_ue (void)
 
void Synch_all ()
 

Typedef Documentation

◆ FLA_Obj_gpu

◆ FLASH_Queue_vars

Function Documentation

◆ FLASH_Queue_check_block_gpu()

FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

1552 {
1553  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1554  int k;
1555  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1556  FLA_Bool r_val = TRUE;
1557 
1558 #ifdef FLA_ENABLE_MULTITHREADING
1559  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560 #endif
1561 
1562  // Locate the position of the block on the GPU.
1563  for ( k = 0; k < gpu_n_blocks; k++ )
1564  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565  break;
1566 
1567  if ( k < gpu_n_blocks )
1568  {
1569  // Request this block if it is dirty.
1570  if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571  {
1572  args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573 
1574  r_val = FALSE;
1575  }
1576  }
1577 
1578  // Check the victim block.
1579  if ( obj.base == args->victim[thread].obj.base )
1580  r_val = FALSE;
1581 
1582 #ifdef FLA_ENABLE_MULTITHREADING
1583  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584 #endif
1585 
1586  return r_val;
1587 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Obj_gpu * victim
Definition: FLASH_Queue_exec.c:107
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_check_gpu()

FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

1453 {
1454  int i, j, k;
1455  int thread = t->thread;
1456  int n_input_args = t->n_input_args;
1457  int n_output_args = t->n_output_args;
1458  int n_threads = FLASH_Queue_get_num_threads();
1459  FLA_Bool r_val = TRUE;
1460  FLA_Bool t_val;
1461  FLA_Bool duplicate;
1462  FLA_Obj obj;
1463 
1464  // Check the input and output arguments on the GPUs.
1465  for ( i = 0; i < n_input_args + n_output_args; i++ )
1466  {
1467  // Check for duplicate blocks.
1468  duplicate = FALSE;
1469 
1470  // Find the correct input or output argument.
1471  if ( i < n_input_args )
1472  {
1473  obj = t->input_arg[i];
1474 
1475  for ( j = 0; j < n_output_args && !duplicate; j++ )
1476  {
1477  if ( obj.base == t->output_arg[j].base )
1478  duplicate = TRUE;
1479  }
1480 
1481  for ( j = 0; j < i && !duplicate; j++ )
1482  {
1483  if ( obj.base == t->input_arg[j].base )
1484  duplicate = TRUE;
1485  }
1486  }
1487  else
1488  {
1489  obj = t->output_arg[i - n_input_args];
1490 
1491  for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1492  {
1493  if ( obj.base == t->output_arg[j].base )
1494  duplicate = TRUE;
1495  }
1496  }
1497 
1498  // If the block has not been processed before.
1499  if ( !duplicate )
1500  {
1501  // Macroblock is used.
1502  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1503  {
1504  dim_t jj, kk;
1505  dim_t m = FLA_Obj_length( obj );
1506  dim_t n = FLA_Obj_width( obj );
1507  dim_t cs = FLA_Obj_col_stride( obj );
1508  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1509 
1510  // Clear each block in macroblock.
1511  for ( jj = 0; jj < n; jj++ )
1512  {
1513  for ( kk = 0; kk < m; kk++ )
1514  {
1515  obj = *( buf + jj * cs + kk );
1516 
1517  t_val = TRUE;
1518 
1519  // Check to see if the block is dirty on another GPU.
1520  for ( k = 0; k < n_threads && t_val; k++ )
1521  if ( k != thread )
1522  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
1523 
1524  r_val = r_val && t_val;
1525  }
1526  }
1527  }
1528  else
1529  {
1530  t_val = TRUE;
1531 
1532  // Check to see if the block is dirty on another GPU.
1533  for ( k = 0; k < n_threads && t_val; k++ )
1534  if ( k != thread )
1535  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
1536 
1537  r_val = r_val && t_val;
1538  }
1539  }
1540  }
1541 
1542  return r_val;
1543 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
int n_input_args
Definition: FLA_type_defs.h:217
FLA_Bool FLASH_Queue_check_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1546
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51

◆ FLASH_Queue_create_gpu()

void FLASH_Queue_create_gpu ( int  thread,
void *  arg 
)

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, and i.

Referenced by FLASH_Queue_exec_parallel_function().

1233 {
1234  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1235  int i;
1236  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1237  dim_t block_size = args->block_size;
1238  FLA_Datatype datatype = args->datatype;
1239 
1240  // Exit if not using GPU.
1241  if ( !FLASH_Queue_get_enabled_gpu() )
1242  return;
1243 
1244  // Bind thread to GPU.
1245  FLASH_Queue_bind_gpu( thread );
1246 
1247  // Allocate the memory on the GPU for all the blocks a priori.
1248  for ( i = 0; i < gpu_n_blocks; i++ )
1249  FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250 
1251  return;
1252 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Datatype datatype
Definition: FLASH_Queue_exec.c:116
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Error FLASH_Queue_bind_gpu(int thread)
Definition: FLASH_Queue_gpu.c:133
Definition: FLASH_Queue_exec.c:54
dim_t block_size
Definition: FLASH_Queue_exec.c:113
int FLA_Datatype
Definition: FLA_type_defs.h:49
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Error FLASH_Queue_alloc_gpu(dim_t size, FLA_Datatype datatype, void **buffer_gpu)
Definition: FLASH_Queue_gpu.c:147
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_destroy_gpu()

void FLASH_Queue_destroy_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, i, and FLA_Obj_gpu_struct::obj.

Referenced by FLASH_Queue_exec_parallel_function().

1261 {
1262  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1263  int i;
1264  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1265  FLA_Obj_gpu gpu_obj;
1266 
1267  // Exit if not using GPU.
1268  if ( !FLASH_Queue_get_enabled_gpu() )
1269  return;
1270 
1271  // Examine every block left on the GPU.
1272  for ( i = 0; i < gpu_n_blocks; i++ )
1273  {
1274  gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275 
1276  // Flush the blocks that are dirty.
1277  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279 
1280  // Free the memory on the GPU for all the blocks.
1281  FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282  }
1283 
1284  return;
1285 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
FLA_Error FLASH_Queue_free_gpu(void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:171
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
int i
Definition: bl1_axmyv2.c:145
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_exec()

void FLASH_Queue_exec ( void  )

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), for(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, i, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

127 {
128  int n_tasks = FLASH_Queue_get_num_tasks();
129  int n_threads = FLASH_Queue_get_num_threads();
130  int n_queues;
131  int n_caches;
132  int size;
133  int i;
134  dim_t block_size = FLASH_Queue_get_block_size();
135  double dtime;
136 
137  FLA_Lock* run_lock;
138  FLA_Lock* dep_lock;
139  FLA_Lock* war_lock;
140  FLA_Lock* cac_lock;
141 
142  FLA_Obj* cache;
143  FLA_Obj* prefetch;
144  FLASH_Queue* wait_queue;
145 
146 #ifdef FLA_ENABLE_GPU
147 #ifdef FLA_ENABLE_MULTITHREADING
148  FLA_Lock* gpu_lock;
149 #endif
150  FLA_Obj_gpu* gpu;
151  FLA_Obj_gpu* victim;
152  FLA_Obj_gpu* gpu_log;
153  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
154 #endif
155 
156  // All the necessary variables for the SuperMatrix mechanism.
157  FLASH_Queue_vars args;
158 
159  // If the queue is empty, return early.
160  if ( n_tasks == 0 )
161  return;
162 
163 #ifndef FLA_ENABLE_MULTITHREADING
164  // Turn off work stealing in simulation mode.
166 #endif
167 
168  // Query the number of user set threads per queue.
169  n_queues = FLASH_Queue_get_cores_per_queue();
170 
171  // Default user setting for number of threads.
172  if ( n_queues <= 0 )
173  {
174  // Do not use data affinity or work stealing when caching is enabled.
175  if ( FLASH_Queue_get_caching() )
176  {
177  FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
179  }
180 
181  // Do not use work stealing when data affinity is enabled.
182  if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
183  {
185  }
186 
187  // Allocate different arrays if using data affinity.
188  n_queues = ( FLASH_Queue_get_data_affinity() ==
189  FLASH_QUEUE_AFFINITY_NONE &&
190  !FLASH_Queue_get_work_stealing() ? 1 : n_threads );
191  }
192  else
193  {
194  // Set the number of queues.
195  n_queues = n_threads / n_queues;
196 
197  // Must use at least one queue.
198  if ( n_queues == 0 )
199  n_queues = 1;
200 
201  if ( n_queues == 1 )
202  {
203  // Turn off all multiple queue implementations.
204  FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
206  }
207  else
208  {
209  // Use 2D data affinity for multiple queues if nothing is set.
210  if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
212  {
213  FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC );
214  }
215  }
216  }
217 
218  // Determine the number of caches.
219  n_caches = n_threads / FLASH_Queue_get_cores_per_cache();
220 
221  args.n_queues = n_queues;
222  args.n_caches = n_caches;
223 
224 #ifdef FLA_ENABLE_MULTITHREADING
225  // Allocate memory for array of locks.
226  run_lock = ( FLA_Lock* ) FLA_malloc( n_queues * sizeof( FLA_Lock ) );
227  dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
228  war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
229  cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches * sizeof( FLA_Lock ) );
230 
231  args.run_lock = run_lock;
232  args.dep_lock = dep_lock;
233  args.war_lock = war_lock;
234  args.cac_lock = cac_lock;
235 
236  // Initialize the all lock.
237  FLA_Lock_init( &(args.all_lock) );
238 
239  // Initialize the run lock for thread i.
240  for ( i = 0; i < n_queues; i++ )
241  {
242  FLA_Lock_init( &(args.run_lock[i]) );
243  }
244 
245  // Initialize the dep and war locks for thread i.
246  for ( i = 0; i < n_threads; i++ )
247  {
248  FLA_Lock_init( &(args.dep_lock[i]) );
249  FLA_Lock_init( &(args.war_lock[i]) );
250  }
251 
252  // Initialize the cac locks for each cache.
253  for ( i = 0; i < n_caches; i++ )
254  {
255  FLA_Lock_init( &(args.cac_lock[i]) );
256  }
257 #endif
258 
259  // The number of blocks that can fit into the cache on each thread.
260  if ( block_size == 0 )
261  size = MIN_CACHE_BLOCKS;
262  else
263  size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS);
264  args.size = size;
265 
266  // Allocate memory for cache, prefetch buffer, and waiting queue.
267  cache = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
268  prefetch = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
269  wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ));
270 
271  args.cache = cache;
272  args.prefetch = prefetch;
273  args.wait_queue = wait_queue;
274 
275  // Initialize cache, prefetch buffer, and waiting queue.
276  for ( i = 0; i < size * n_caches; i++ )
277  args.cache[i].base = NULL;
278 
279  for ( i = 0; i < size; i++ )
280  args.prefetch[i].base = NULL;
281 
282  for ( i = 0; i < n_queues; i++ )
283  {
284  args.wait_queue[i].n_tasks = 0;
285  args.wait_queue[i].head = NULL;
286  args.wait_queue[i].tail = NULL;
287  }
288 
289  // Initialize the aggregate task counter.
290  args.pc = 0;
291 
292 #ifdef FLA_ENABLE_GPU
293 #ifdef FLA_ENABLE_MULTITHREADING
294  // Allocate and initialize the gpu locks.
295  gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
296  args.gpu_lock = gpu_lock;
297 
298  for ( i = 0; i < n_threads; i++ )
299  FLA_Lock_init( &(args.gpu_lock[i]) );
300 #endif
301  // Allocate and initialize GPU software cache.
302  gpu = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
303  args.gpu = gpu;
304 
305  for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
306  {
307  args.gpu[i].obj.base = NULL;
308  args.gpu[i].buffer_gpu = NULL;
309  args.gpu[i].clean = TRUE;
310  args.gpu[i].request = FALSE;
311  }
312 
313  victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
314  args.victim = victim;
315 
316  for ( i = 0; i < n_threads; i++ )
317  args.victim[i].obj.base = NULL;
318 
319  gpu_log = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
320  args.gpu_log = gpu_log;
321 #endif
322 
323  // Initialize tasks with critical information.
324  FLASH_Queue_init_tasks( ( void* ) &args );
325 
326  // Display verbose output before free all tasks.
329 
330  // Start timing the parallel execution.
331  dtime = FLA_Clock();
332 
333 #ifdef FLA_ENABLE_MULTITHREADING
334  // Parallel Execution!
335  FLASH_Queue_exec_parallel( ( void* ) &args );
336 #else
337  // Simulation!
338  FLASH_Queue_exec_simulation( ( void* ) &args );
339 #endif
340 
341  // End timing the parallel execution.
342  dtime = FLA_Clock() - dtime;
344 
345 #ifdef FLA_ENABLE_MULTITHREADING
346  // Destroy the locks.
347  FLA_Lock_destroy( &(args.all_lock) );
348 
349  for ( i = 0; i < n_queues; i++ )
350  {
351  FLA_Lock_destroy( &(args.run_lock[i]) );
352  }
353 
354  for ( i = 0; i < n_threads; i++ )
355  {
356  FLA_Lock_destroy( &(args.dep_lock[i]) );
357  FLA_Lock_destroy( &(args.war_lock[i]) );
358  }
359 
360  for ( i = 0; i < n_caches; i++ )
361  {
362  FLA_Lock_destroy( &(args.cac_lock[i]) );
363  }
364 
365  // Deallocate memory.
366  FLA_free( run_lock );
367  FLA_free( dep_lock );
368  FLA_free( war_lock );
369  FLA_free( cac_lock );
370 #endif
371 
372  FLA_free( cache );
373  FLA_free( prefetch );
374  FLA_free( wait_queue );
375 
376 #ifdef FLA_ENABLE_GPU
377 #ifdef FLA_ENABLE_MULTITHREADING
378  for ( i = 0; i < n_threads; i++ )
379  FLA_Lock_destroy( &(args.gpu_lock[i]) );
380  FLA_free( gpu_lock );
381 #endif
382  FLA_free( gpu );
383  FLA_free( victim );
384  FLA_free( gpu_log );
385 #endif
386 
387  // Reset values for next call to FLASH_Queue_exec().
389 
390  return;
391 }
dim_t FLASH_Queue_get_block_size(void)
Definition: FLASH_Queue.c:476
int pc
Definition: FLASH_Queue_exec.c:96
dim_t FLASH_Queue_get_cache_size(void)
Definition: FLASH_Queue.c:500
FLA_Obj_gpu * gpu_log
Definition: FLASH_Queue_exec.c:110
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Bool FLASH_Queue_get_caching(void)
Definition: FLASH_Queue.c:356
void FLA_Lock_init(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:28
FLA_Lock * war_lock
Definition: FLASH_Queue_exec.c:70
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
void FLASH_Queue_verbose_output(void)
Definition: FLASH_Queue.c:1782
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Lock * cac_lock
Definition: FLASH_Queue_exec.c:74
void FLASH_Queue_reset(void)
Definition: FLASH_Queue.c:583
void FLASH_Queue_exec_simulation(void *arg)
Definition: FLASH_Queue_exec.c:2589
int FLASH_Queue_get_cores_per_queue(void)
Definition: FLASH_Queue.c:572
double FLA_Clock(void)
Definition: FLA_Clock.c:20
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLASH_Task * tail
Definition: FLA_type_defs.h:180
FLA_Obj_gpu * victim
Definition: FLASH_Queue_exec.c:107
Definition: FLA_type_defs.h:158
void FLASH_Queue_exec_parallel(void *arg)
Definition: FLASH_Queue_exec.c:2043
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
for(i=0;i< n_run;++i)
Definition: bl1_axmyv2.c:188
void FLASH_Queue_init_tasks(void *arg)
Definition: FLASH_Queue_exec.c:394
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
Definition: FLA_type_defs.h:101
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_destroy(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:73
FLASH_Task * head
Definition: FLA_type_defs.h:179
void FLASH_Queue_set_data_affinity(FLASH_Data_aff data_affinity)
Definition: FLASH_Queue.c:391
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
int n_caches
Definition: FLASH_Queue_exec.c:80
void FLASH_Queue_set_work_stealing(FLA_Bool work_stealing)
Definition: FLASH_Queue.c:367
FLA_Bool FLASH_Queue_get_work_stealing(void)
Definition: FLASH_Queue.c:380
FLA_Obj * prefetch
Definition: FLASH_Queue_exec.c:89
FLASH_Data_aff FLASH_Queue_get_data_affinity(void)
Definition: FLASH_Queue.c:404
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int FLASH_Queue_get_cores_per_cache(void)
Definition: FLASH_Queue.c:548
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
void FLASH_Queue_set_parallel_time(double dtime)
Definition: FLASH_Queue.c:448
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
unsigned int n_tasks
Definition: FLA_type_defs.h:176
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition: FLASH_Queue.c:308
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Lock all_lock
Definition: FLASH_Queue_exec.c:58
Definition: FLA_type_defs.h:173
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40
int n_queues
Definition: FLASH_Queue_exec.c:77
FLA_Lock * dep_lock
Definition: FLASH_Queue_exec.c:66

◆ FLASH_Queue_exec_gpu()

FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_parallel_function().

1294 {
1295  void** input_arg;
1296  void** output_arg;
1297 
1298  if ( t == NULL )
1299  return TRUE;
1300 
1301  // If not using the GPU, then execute on CPU.
1302  if ( !FLASH_Queue_get_enabled_gpu() )
1303  {
1304  FLASH_Queue_exec_task( t );
1305 
1306  return TRUE;
1307  }
1308 
1309  // Check if all the operands are ready and up to date.
1310  if ( !FLASH_Queue_check_gpu( t, arg ) )
1311  {
1312  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1313  int queue = t->queue;
1314  t->hit = FALSE;
1315 
1316 #ifdef FLA_ENABLE_MULTITHREADING
1317  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
1318 #endif
1319  // Reenqueue the task if the blocks are not all flushed.
1320  FLASH_Queue_wait_enqueue( t, arg );
1321 
1322 #ifdef FLA_ENABLE_MULTITHREADING
1323  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
1324 #endif
1325 
1326  return FALSE;
1327  }
1328 
1329  // If GPU is enabled, but the task is not supported for GPU execution.
1330  if ( !t->enabled_gpu )
1331  {
1332  int i, j, k;
1333  int thread = t->thread;
1334  int n_input_args = t->n_input_args;
1335  int n_output_args = t->n_output_args;
1336  int n_threads = FLASH_Queue_get_num_threads();
1337  FLA_Bool duplicate;
1338  FLA_Obj obj;
1339 
1340  // Check the blocks on each GPU.
1341  for ( k = 0; k < n_threads; k++ )
1342  {
1343  // Check the input and output arguments on the GPUs.
1344  for ( i = 0; i < n_input_args + n_output_args; i++ )
1345  {
1346  // Check for duplicate blocks.
1347  duplicate = FALSE;
1348 
1349  // Find the correct input or output argument.
1350  if ( i < n_input_args )
1351  {
1352  obj = t->input_arg[i];
1353 
1354  for ( j = 0; j < n_output_args && !duplicate; j++ )
1355  {
1356  if ( obj.base == t->output_arg[j].base )
1357  duplicate = TRUE;
1358  }
1359 
1360  for ( j = 0; j < i && !duplicate; j++ )
1361  {
1362  if ( obj.base == t->input_arg[j].base )
1363  duplicate = TRUE;
1364  }
1365  }
1366  else
1367  {
1368  obj = t->output_arg[i - n_input_args];
1369 
1370  for ( j = 0; j < i - n_input_args && !duplicate; j++ )
1371  {
1372  if ( obj.base == t->output_arg[j].base )
1373  duplicate = TRUE;
1374  }
1375  }
1376 
1377  // If the block has not been processed before.
1378  if ( !duplicate )
1379  {
1380  // Macroblock is used.
1381  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1382  {
1383  dim_t jj, kk;
1384  dim_t m = FLA_Obj_length( obj );
1385  dim_t n = FLA_Obj_width( obj );
1386  dim_t cs = FLA_Obj_col_stride( obj );
1387  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1388 
1389  // Clear each block in macroblock.
1390  for ( jj = 0; jj < n; jj++ )
1391  {
1392  for ( kk = 0; kk < m; kk++ )
1393  {
1394  obj = *( buf + jj * cs + kk );
1395 
1396  // Flush the block to main memory if it is on the GPU.
1397  if ( k == thread )
1398  FLASH_Queue_flush_block_gpu( obj, k, arg );
1399 
1400  // Invalidate output block on all GPUs.
1401  if ( i >= n_input_args )
1402  FLASH_Queue_invalidate_block_gpu( obj, k, arg );
1403  }
1404  }
1405  }
1406  else
1407  {
1408  // Flush the block to main memory if it is on the GPU.
1409  if ( k == thread )
1410  FLASH_Queue_flush_block_gpu( obj, k, arg );
1411 
1412  // Invalidate output block on all GPUs.
1413  if ( i >= n_input_args )
1414  FLASH_Queue_invalidate_block_gpu( obj, k, arg );
1415  }
1416  }
1417  }
1418  }
1419 
1420  // Execute the task on CPU instead of GPU.
1421  FLASH_Queue_exec_task( t );
1422 
1423  return TRUE;
1424  }
1425 
1426  // Gather the pointers for the data on the GPU.
1427  input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
1428  output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );
1429 
1430  // Bring all the blocks to GPU.
1431  FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );
1432 
1433  // Execute the task on GPU.
1434  FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );
1435 
1436  // Mark all the output blocks as dirty.
1437  FLASH_Queue_mark_gpu( t, arg );
1438 
1439  // Free memory.
1440  FLA_free( input_arg );
1441  FLA_free( output_arg );
1442 
1443  return TRUE;
1444 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1844
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
FLA_Bool enabled_gpu
Definition: FLA_type_defs.h:206
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
int n_input_args
Definition: FLA_type_defs.h:217
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
int queue
Definition: FLA_type_defs.h:190
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
void FLASH_Queue_exec_task_gpu(FLASH_Task *t, void **input_arg, void **output_arg)
Definition: FLASH_Queue_gpu.c:225
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
void FLASH_Queue_flush_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1893
int thread
Definition: FLA_type_defs.h:192
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
void FLASH_Queue_update_gpu(FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
Definition: FLASH_Queue_exec.c:1590
FLA_Bool FLASH_Queue_check_gpu(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:1447
void FLASH_Queue_mark_gpu(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:1787

◆ FLASH_Queue_exec_parallel()

void FLASH_Queue_exec_parallel ( void *  arg)

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_get_num_threads(), and i.

Referenced by FLASH_Queue_exec().

2049 {
2050  int i;
2051  int n_threads = FLASH_Queue_get_num_threads();
2052  void* (*thread_entry_point)( void* );
2053 
2054  // Allocate the thread structures array. Here, an array of FLASH_Thread
2055  // structures of length n_threads is allocated and the fields of each
2056  // structure set to appropriate values.
2057  FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );
2058 
2059  // Initialize the thread structures array.
2060  for ( i = 0; i < n_threads; i++ )
2061  {
2062  // Save the thread's identifier.
2063  thread[i].id = i;
2064 
2065  // Save the pointer to the necessary variables with the thread.
2066  thread[i].args = arg;
2067 
2068  // The pthread object, if it was even compiled into the FLASH_Thread
2069  // structure, will be initialized by the pthread implementation when we
2070  // call pthread_create() and does not need to be touched at this time.
2071  }
2072 
2073  // Determine which function to send threads to.
2074  thread_entry_point = FLASH_Queue_exec_parallel_function;
2075 
2076 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
2077 
2078  // An OpenMP parallel for region spawns n_threads threads. Each thread
2079  // executes the work function with a different FLASH_Thread argument.
2080  // An implicit synchronization point exists at the end of the curly
2081  // brace scope.
2082  #pragma omp parallel for \
2083  private( i ) \
2084  shared( thread, n_threads, thread_entry_point ) \
2085  schedule( static, 1 ) \
2086  num_threads( n_threads )
2087  for ( i = 0; i < n_threads; ++i )
2088  {
2089  thread_entry_point( ( void* ) &thread[i] );
2090  }
2091 
2092 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2093 
2094  // Create each POSIX thread needed in addition to the main thread.
2095  for ( i = 1; i < n_threads; i++ )
2096  {
2097  int pthread_e_val;
2098 
2099  // Create thread i with default attributes.
2100  pthread_e_val = pthread_create( &(thread[i].pthread_obj),
2101  NULL,
2102  thread_entry_point,
2103  ( void* ) &thread[i] );
2104 
2105 #ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2106  FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
2107  FLA_Check_error_code( e_val );
2108 #endif
2109  }
2110 
2111  // The main thread is assigned the role of thread 0. Here we manually
2112  // execute it as a worker thread.
2113  thread_entry_point( ( void* ) &thread[0] );
2114 
2115  // Wait for non-main threads to finish.
2116  for ( i = 1; i < n_threads; i++ )
2117  {
2118  // These two variables are declared local to this for loop since this
2119  // is the only place they are needed, and since they would show up as
2120  // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
2121  // Strangely, the Intel compiler produces code that results in an
2122  // "unaligned access" runtime message if thread_status is declared as
2123  // an int. Declaring it as a long or void* appears to force the
2124  // compiler (not surprisingly) into aligning it to an 8-byte boundary.
2125  int pthread_e_val;
2126  void* thread_status;
2127 
2128  // Wait for thread i to invoke its respective pthread_exit().
2129  // The return value passed to pthread_exit() is provided to us
2130  // via status, if one was given.
2131  pthread_e_val = pthread_join( thread[i].pthread_obj,
2132  ( void** ) &thread_status );
2133 
2134 #ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
2135  FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
2136  FLA_Check_error_code( e_val );
2137 #endif
2138  }
2139 
2140 #endif
2141 
2142  FLA_free( thread );
2143 
2144  return;
2145 }
int FLA_Error
Definition: FLA_type_defs.h:47
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLA_Error FLA_Check_pthread_join_result(int pthread_join_r_val)
Definition: FLA_Check.c:760
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
Definition: FLA_type_defs.h:254
void * FLASH_Queue_exec_parallel_function(void *arg)
Definition: FLASH_Queue_exec.c:2156
int i
Definition: bl1_axmyv2.c:145
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Error FLA_Check_pthread_create_result(int pthread_create_r_val)
Definition: FLA_Check.c:750

◆ FLASH_Queue_exec_parallel_function()

void* FLASH_Queue_exec_parallel_function ( void *  arg)

References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), i, FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

2162 {
2163  FLASH_Queue_vars* args;
2164  int i;
2165  int queue;
2166  int cache;
2167  int n_tasks = FLASH_Queue_get_num_tasks();
2168  int n_threads = FLASH_Queue_get_num_threads();
2169  int n_cores = FLASH_Queue_get_cores_per_cache();
2170  FLA_Bool caching = FLASH_Queue_get_caching();
2172  FLA_Bool committed = TRUE;
2173  FLA_Bool condition = TRUE;
2174  FLA_Bool enabled = FALSE;
2175  FLA_Bool available;
2176  FLASH_Task* t = NULL;
2177  FLASH_Task* r = NULL;
2178  FLASH_Thread* me;
2179  //cpu_set_t cpu_set;
2180 
2181  // Interpret the thread argument as what it really is--a pointer to an
2182  // FLASH_Thread structure.
2183  me = ( FLASH_Thread* ) arg;
2184 
2185  // Extract the variables from the current thread.
2186  args = ( FLASH_Queue_vars* ) me->args;
2187 
2188  // Figure out the id of the current thread.
2189  i = me->id;
2190 
2191  // Set the CPU affinity; We want the current thread i to run only on CPU i.
2192  //CPU_ZERO( &cpu_set );
2193  //CPU_SET( i, &cpu_set );
2194  //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );
2195 
2196  // Determine to which queue this thread belongs.
2197  queue = i / ( n_threads / args->n_queues );
2198 
2199  // Determine to which cache this thread belongs.
2200  cache = i / n_cores;
2201 
2202 #ifdef FLA_ENABLE_GPU
2203  // Create memory on GPU.
2204  FLASH_Queue_create_gpu( i, ( void* ) args );
2205 
2206  // Save whether GPUs are enabled.
2207  enabled = FLASH_Queue_get_enabled_gpu();
2208 
2209  // Only use each GPU as its own cache when GPUs are enabled.
2210  if ( enabled )
2211  cache = i;
2212 #endif
2213 
2214  // Prefetch blocks into the cache before execution.
2215  if ( caching && !enabled && i % n_cores == 0 )
2216  FLASH_Queue_prefetch( cache, ( void* ) args );
2217 
2218  // Loop until all the tasks have committed.
2219  while ( condition )
2220  {
2221 #ifdef FLA_ENABLE_GPU
2222  // Check to see if any blocks on GPU need to be flushed.
2223  FLASH_Queue_flush_gpu( i, ( void* ) args );
2224 #endif
2225 
2226  // Dequeue a task if there has not been one binded to thread.
2227  if ( r == NULL )
2228  {
2229  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2230 
2231  // Obtain task to execute.
2232  t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
2233 
2234  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2235  }
2236  else
2237  {
2238  // Obtain the binded task.
2239  t = r;
2240  r = NULL;
2241  }
2242 
2243  // Dequeued a task from the waiting queue.
2244  available = ( t != NULL );
2245 
2246  if ( available )
2247  {
2248  // Save the thread and cache that executes the task.
2249  t->thread = i;
2250  t->cache = cache;
2251 
2252  if ( caching && !enabled )
2253  {
2254  // Update the current state of the cache.
2255  FLASH_Queue_update_cache( t, ( void* ) args );
2256  }
2257 
2258 #ifdef FLA_ENABLE_GPU
2259  // Execute the task on GPU.
2260  committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
2261 #else
2262  // Execute the task.
2263  FLASH_Queue_exec_task( t );
2264 #endif
2265 
2266  // If the task has executed or not.
2267  if ( committed )
2268  {
2269  // Update task dependencies.
2270  r = FLASH_Task_update_dependencies( t, ( void* ) args );
2271 
2272  // Free the task once it executes in parallel.
2273  FLASH_Task_free_parallel( t, ( void* ) args );
2274  }
2275  }
2276  else
2277  {
2278  if ( stealing )
2279  {
2280  // Perform work stealing if there are no tasks to dequeue.
2281  r = FLASH_Queue_work_stealing( queue, ( void* ) args );
2282  }
2283  }
2284 
2285  FLA_Lock_acquire( &(args->all_lock) ); // A ***
2286 
2287  // Increment program counter.
2288  if ( available && committed )
2289  args->pc++;
2290 
2291  // Terminate loop.
2292  if ( args->pc >= n_tasks )
2293  condition = FALSE;
2294 
2295  FLA_Lock_release( &(args->all_lock) ); // A ***
2296  }
2297 
2298 #ifdef FLA_ENABLE_GPU
2299  // Destroy and flush contents of GPU back to main memory.
2300  FLASH_Queue_destroy_gpu( i, ( void* ) args );
2301 #endif
2302 
2303 #if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
2304  // If this is a non-main thread, then exit with a zero (normal) error code.
2305  // The main thread cannot call pthread_exit() because this routine never
2306  // returns. The main thread must proceed so it can oversee the joining of
2307  // the exited non-main pthreads.
2308  if ( i != 0 )
2309  pthread_exit( ( void* ) NULL );
2310 #endif
2311 
2312  return ( void* ) NULL;
2313 }
int pc
Definition: FLASH_Queue_exec.c:96
FLA_Bool FLASH_Queue_get_caching(void)
Definition: FLASH_Queue.c:356
FLASH_Task * FLASH_Queue_work_stealing(int queue, void *arg)
Definition: FLASH_Queue_exec.c:1155
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
void FLASH_Queue_flush_gpu(int thread, void *arg)
Definition: FLASH_Queue_exec.c:1961
void FLASH_Queue_destroy_gpu(int thread, void *arg)
Definition: FLASH_Queue_exec.c:1255
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
void FLASH_Queue_prefetch(int cache, void *arg)
Definition: FLASH_Queue_exec.c:1024
void FLASH_Queue_update_cache(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:847
Definition: FLA_type_defs.h:183
FLASH_Task * FLASH_Task_update_dependencies(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:2316
FLA_Bool FLASH_Queue_exec_gpu(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:1288
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
void FLASH_Task_free_parallel(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:2446
Definition: FLA_type_defs.h:254
FLA_Bool FLASH_Queue_get_work_stealing(void)
Definition: FLASH_Queue.c:380
void FLASH_Queue_create_gpu(int thread, void *arg)
Definition: FLASH_Queue_exec.c:1227
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
void * args
Definition: FLA_type_defs.h:260
Definition: FLASH_Queue_exec.c:54
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:678
int FLASH_Queue_get_cores_per_cache(void)
Definition: FLASH_Queue.c:548
int FLA_Bool
Definition: FLA_type_defs.h:46
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
int id
Definition: FLA_type_defs.h:257
int thread
Definition: FLA_type_defs.h:192
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Lock all_lock
Definition: FLASH_Queue_exec.c:58
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Queue_exec_simulation()

void FLASH_Queue_exec_simulation ( void *  arg)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), i, FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), RCCE_wtime(), Synch_all(), FLASH_Dep_s::task, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec().

2595 {
2596  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2597  int i, j;
2598  int queue;
2599  int cache;
2600  int n_stages = 0;
2601  int n_queues = args->n_queues;
2602  int n_tasks = FLASH_Queue_get_num_tasks();
2603  int n_threads = FLASH_Queue_get_num_threads();
2604  int n_cores = FLASH_Queue_get_cores_per_cache();
2606  FLASH_Task* task;
2607  FLASH_Task* t;
2608  FLASH_Dep* d;
2609 
2610  // An array to hold tasks to be executed during of simulation.
2611 #ifdef FLA_ENABLE_WINDOWS_BUILD
2612  FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
2613 #else
2614  FLASH_Task* exec_array[n_threads];
2615 #endif
2616 
2617  for ( i = 0; i < n_threads; i++ )
2618  {
2619  // Initialize all exec_array to NULL.
2620  exec_array[i] = NULL;
2621 
2622  // Prefetch blocks into the cache before execution.
2623  if ( i % n_cores == 0 )
2624  FLASH_Queue_prefetch( i, arg );
2625  }
2626 
2627  // Loop until all the tasks have committed.
2628  while ( args->pc < n_tasks )
2629  {
2630  for ( i = 0; i < n_threads; i++ )
2631  {
2632  // Update waiting queue with ready tasks.
2633  t = exec_array[i];
2634 
2635  if ( t != NULL )
2636  {
2637  // Check each dependent task.
2638  d = t->dep_arg_head;
2639 
2640  for ( j = 0; j < t->n_dep_args; j++ )
2641  {
2642  task = d->task;
2643  task->n_ready--;
2644 
2645  // Place newly ready tasks on waiting queue.
2646  if ( task->n_ready == 0 )
2647  {
2648  FLASH_Queue_wait_enqueue( task, arg );
2649  }
2650 
2651  // Go to the next dep.
2652  d = d->next_dep;
2653  }
2654 
2655  // Free the task.
2656  FLASH_Task_free( t );
2657  }
2658  }
2659 
2660  n_stages++;
2661  if ( !verbose )
2662  printf( "%7d", n_stages );
2663 
2664  // Move ready tasks from the waiting queue to execution queue.
2665  for ( i = 0; i < n_threads; i++ )
2666  {
2667  // Determine to which queue this thread belongs.
2668  queue = i / ( n_threads / n_queues );
2669 
2670  // Determine to which cache this thread belongs.
2671  cache = i / n_cores;
2672 
2673  // Dequeue a task.
2674  t = FLASH_Queue_wait_dequeue( queue, cache, arg );
2675 
2676  // Save the task for execution.
2677  exec_array[i] = t;
2678 
2679  if ( t != NULL )
2680  {
2681  // Save the thread and cache that executes the task.
2682  t->thread = i;
2683  t->cache = cache;
2684 
2685  // Increment program counter.
2686  args->pc++;
2687  }
2688  }
2689 
2690  // Execute independent tasks.
2691  for ( i = 0; i < n_threads; i++ )
2692  {
2693  t = exec_array[i];
2694  FLASH_Queue_update_cache( t, arg );
2695  FLASH_Queue_exec_task( t );
2696 
2697  if ( !verbose )
2698  printf( "%7s", ( t == NULL ? " " : t->name ) );
2699 
2700  // Free the task if this is the last stage.
2701  if ( args->pc == n_tasks && t != NULL )
2702  FLASH_Task_free( t );
2703  }
2704 
2705  if ( !verbose )
2706  printf( "\n" );
2707  }
2708 
2709  if ( !verbose )
2710  printf( "\n" );
2711 
2712 #ifdef FLA_ENABLE_WINDOWS_BUILD
2713  FLA_free( exec_array );
2714 #endif
2715 
2716  return;
2717 }
int pc
Definition: FLASH_Queue_exec.c:96
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
Definition: FLA_type_defs.h:244
int n_ready
Definition: FLA_type_defs.h:186
void FLASH_Queue_prefetch(int cache, void *arg)
Definition: FLASH_Queue_exec.c:1024
void FLASH_Queue_update_cache(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:847
Definition: FLA_type_defs.h:183
int FLASH_Verbose
Definition: FLA_type_defs.h:113
FLASH_Task * task
Definition: FLA_type_defs.h:247
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
void * FLA_malloc(size_t size)
Definition: FLA_Memory.c:111
void FLASH_Queue_exec_task(FLASH_Task *t)
Definition: FLASH_Queue.c:1141
Definition: FLASH_Queue_exec.c:54
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:678
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
int FLASH_Queue_get_cores_per_cache(void)
Definition: FLASH_Queue.c:548
void FLASH_Task_free(FLASH_Task *t)
Definition: FLASH_Queue.c:1020
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
int thread
Definition: FLA_type_defs.h:192
FLASH_Verbose FLASH_Queue_get_verbose_output(void)
Definition: FLASH_Queue.c:308
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
char * name
Definition: FLA_type_defs.h:203
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Queue_flush_block_gpu()

void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

1899 {
1900  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1901  int k;
1902  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1903  FLA_Bool transfer = FALSE;
1904  FLA_Obj_gpu gpu_obj;
1905 
1906 #ifdef FLA_ENABLE_MULTITHREADING
1907  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1908 #endif
1909 
1910  // Locate the position of the block on the GPU.
1911  for ( k = 0; k < gpu_n_blocks; k++ )
1912  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1913  break;
1914 
1915  // The block is owned by the GPU.
1916  if ( k < gpu_n_blocks )
1917  {
1918  // Save the block that will be flushed.
1919  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1920 
1921  // If the block is dirty, then flush it.
1922  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1923  transfer = TRUE;
1924  }
1925 
1926 #ifdef FLA_ENABLE_MULTITHREADING
1927  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1928 #endif
1929 
1930  // Exit early if a flush is not required.
1931  if ( !transfer )
1932  return;
1933 
1934  // Flush the block outside the critical section.
1935  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1936 
1937 #ifdef FLA_ENABLE_MULTITHREADING
1938  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1939 #endif
1940 
1941  // Locate the position of the block on the GPU.
1942  for ( k = 0; k < gpu_n_blocks; k++ )
1943  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1944  break;
1945 
1946  if ( k < gpu_n_blocks )
1947  {
1948  // Update the bits for the flushed block.
1949  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1950  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1951  }
1952 
1953 #ifdef FLA_ENABLE_MULTITHREADING
1954  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1955 #endif
1956 
1957  return;
1958 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_flush_gpu()

void FLASH_Queue_flush_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, i, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

1967 {
1968  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1969  int i, k;
1970  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1971  int n_transfer = 0;
1972  FLA_Obj_gpu gpu_obj;
1973 
1974  // Exit if not using GPU.
1975  if ( !FLASH_Queue_get_enabled_gpu() )
1976  return;
1977 
1978 #ifdef FLA_ENABLE_MULTITHREADING
1979  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1980 #endif
1981 
1982  for ( k = 0; k < gpu_n_blocks; k++ )
1983  {
1984  // Save the block that might be flushed.
1985  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1986 
1987  // Flush the block if it is dirty and requested.
1988  if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
1989  {
1990  // Save the block for data transfer outside the critical section.
1991  args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
1992  n_transfer++;
1993  }
1994  }
1995 
1996 #ifdef FLA_ENABLE_MULTITHREADING
1997  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1998 #endif
1999 
2000  // Exit early if a flush is not required.
2001  if ( n_transfer == 0 )
2002  return;
2003 
2004  // Flush the block outside the critical section.
2005  for ( i = 0; i < n_transfer; i++ )
2006  {
2007  gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
2008  FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
2009  }
2010 
2011 #ifdef FLA_ENABLE_MULTITHREADING
2012  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
2013 #endif
2014 
2015  // Update the bits for each block that is flushed.
2016  for ( i = 0; i < n_transfer; i++ )
2017  {
2018  // Locate the position of the block on the GPU.
2019  for ( k = 0; k < gpu_n_blocks; k++ )
2020  if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
2021  args->gpu[thread * gpu_n_blocks + k].obj.base )
2022  break;
2023 
2024  if ( k < gpu_n_blocks )
2025  {
2026  // The block is now clean.
2027  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
2028  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
2029  }
2030  }
2031 
2032 #ifdef FLA_ENABLE_MULTITHREADING
2033  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
2034 #endif
2035 
2036  return;
2037 }
FLA_Obj_gpu * gpu_log
Definition: FLASH_Queue_exec.c:110
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
int i
Definition: bl1_axmyv2.c:145
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_init_tasks()

void FLASH_Queue_init_tasks ( void *  arg)

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, i, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

400 {
401  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
402  int i, j, k;
403  int n_tasks = FLASH_Queue_get_num_tasks();
404  int n_queues = args->n_queues;
405  int n_prefetch = 0;
406  int n_ready = 0;
407  int length = 0;
408  int width = 0;
409  int height = 0;
410  int size = args->size;
412  FLASH_Task* t;
413  FLASH_Dep* d;
414  FLA_Obj obj;
415 
416 #ifdef FLA_ENABLE_GPU
417  dim_t block_size = 0;
418  FLA_Datatype datatype = FLA_FLOAT;
419  dim_t datatype_size = FLA_Obj_datatype_size( datatype );
420 #endif
421 
422  // Find the 2D factorization of the number of threads.
423  if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
424  {
425  int sq_rt = 0;
426  while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
427  sq_rt--;
428  while ( n_queues % sq_rt != 0 ) sq_rt--;
429  length = n_queues / sq_rt;
430  width = sq_rt;
431  }
432 
433  // Grab the tail of the task queue.
435 
436  for ( i = n_tasks - 1; i >= 0; i-- )
437  {
438  // Determine data affinity.
439  if ( data_aff == FLASH_QUEUE_AFFINITY_NONE )
440  { // No data affinity
441  t->queue = 0;
442  }
443  else
444  {
445  // Use the first output block to determine data affinity.
446  obj = t->output_arg[0];
447 
448  // Use the top left block of the macroblock.
449  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
450  obj = *FLASH_OBJ_PTR_AT( obj );
451 
452  if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
453  { // Two-dimensional block cyclic
454  t->queue = ( obj.base->m_index % length ) +
455  ( obj.base->n_index % width ) * length;
456  }
457  else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC )
458  { // One-dimensional row block cyclic
459  t->queue = obj.base->m_index % n_queues;
460  }
461  else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC )
462  { // One-dimensional column block cyclic
463  t->queue = obj.base->n_index % n_queues;
464  }
465  else
466  { // Round-robin
467  t->queue = t->queue % n_queues;
468  }
469  }
470 
471  // Determine the height of each task in the DAG.
472  height = 0;
473  d = t->dep_arg_head;
474 
475  // Take the maximum height of dependent tasks.
476  for ( j = 0; j < t->n_dep_args; j++ )
477  {
478  height = max( height, d->task->height );
479  d = d->next_dep;
480  }
481 
482  t->height = height + 1;
483 
484  // Since freeing a task is always a leaf, we want to force it to execute
485  // earlier by giving it a greater height in order to reclaim memory.
486  if ( t->func == (void *) FLA_Obj_free_buffer_task )
487  t->height += n_tasks;
488 
489 #ifdef FLA_ENABLE_GPU
490  for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
491  {
492  // Find the correct input or output argument.
493  if ( j < t->n_output_args )
494  obj = t->output_arg[j];
495  else
496  obj = t->input_arg[j - t->n_output_args];
497 
498  // Macroblock is used.
499  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
500  {
501  dim_t jj, kk;
502  dim_t m = FLA_Obj_length( obj );
503  dim_t n = FLA_Obj_width( obj );
504  dim_t cs = FLA_Obj_col_stride( obj );
505  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
506 
507  // Check each block in macroblock.
508  for ( jj = 0; jj < n; jj++ )
509  {
510  for ( kk = 0; kk < m; kk++ )
511  {
512  obj = *( buf + jj * cs + kk );
513 
514  block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );
515 
516  if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
517  {
518  datatype = FLA_Obj_datatype( obj );
519  datatype_size = FLA_Obj_datatype_size( datatype );
520  }
521  }
522  }
523  }
524  else // Regular block.
525  {
526  block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );
527 
528  if ( FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
529  {
530  datatype = FLA_Obj_datatype( obj );
531  datatype_size = FLA_Obj_datatype_size( datatype );
532  }
533  }
534  }
535 #endif
536 
537  // Find the first blocks accessed each task.
538  if ( n_prefetch < size )
539  {
540  for ( j = 0; j < t->n_output_args; j++ )
541  {
542  obj = t->output_arg[j];
543 
544  // Macroblock is used.
545  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
546  {
547  dim_t jj, kk;
548  dim_t m = FLA_Obj_length( obj );
549  dim_t n = FLA_Obj_width( obj );
550  dim_t cs = FLA_Obj_col_stride( obj );
551  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
552 
553  // Check each block in macroblock.
554  for ( jj = 0; jj < n; jj++ )
555  {
556  for ( kk = 0; kk < m; kk++ )
557  {
558  obj = *( buf + jj * cs + kk );
559 
560  k = obj.base->n_write_blocks;
561 
562  // This block is one of the first blocks to be accessed.
563  if ( k < size && k == n_prefetch )
564  {
565  args->prefetch[k] = obj;
566  n_prefetch++;
567  }
568  }
569  }
570  }
571  else // Regular block.
572  {
573  k = obj.base->n_write_blocks;
574 
575  // This block is one of the first blocks to be accessed.
576  if ( k < size && k == n_prefetch )
577  {
578  args->prefetch[k] = obj;
579  n_prefetch++;
580  }
581  }
582  }
583  }
584 
585  // Find all ready tasks.
586  t->n_ready += t->n_input_args + t->n_output_args +
587  t->n_macro_args + t->n_war_args;
588 
589  if ( t->n_ready == 0 )
590  {
591  // Save the number of ready and available tasks.
592  n_ready++;
593  }
594 
595  // Go to the previous task.
596  t = t->prev_task;
597  }
598 
599  // Grab the head of the task queue.
601 
602  for ( i = 0; i < n_tasks && n_ready > 0; i++ )
603  {
604  if ( t->n_ready == 0 )
605  {
606  // Enqueue all the ready and available tasks.
607  FLASH_Queue_wait_enqueue( t, arg );
608 
609  // Decrement the number of ready tasks left to be enqueued.
610  n_ready--;
611  }
612 
613  // Go to the next task.
614  t = t->next_task;
615  }
616 
617 #ifdef FLA_ENABLE_GPU
618  args->block_size = block_size;
619  args->datatype = datatype;
620 #endif
621 
622  return;
623 }
FLASH_Task * FLASH_Queue_get_head_task(void)
Definition: FLASH_Queue.c:603
int height
Definition: FLA_type_defs.h:191
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLASH_Task * next_task
Definition: FLA_type_defs.h:237
FLA_Datatype datatype
Definition: FLASH_Queue_exec.c:116
int size
Definition: FLASH_Queue_exec.c:83
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_ready
Definition: FLA_type_defs.h:186
dim_t n_index
Definition: FLA_type_defs.h:135
Definition: FLA_type_defs.h:183
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition: FLA_Query.c:13
FLASH_Task * task
Definition: FLA_type_defs.h:247
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
FLA_Error FLA_Obj_free_buffer_task(FLA_Obj obj, void *cntl)
Definition: FLA_Obj_free_buffer_task.c:13
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
int n_war_args
Definition: FLA_type_defs.h:228
int queue
Definition: FLA_type_defs.h:190
FLASH_Task * FLASH_Queue_get_tail_task(void)
Definition: FLASH_Queue.c:614
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
int n_macro_args
Definition: FLA_type_defs.h:225
FLA_Obj * prefetch
Definition: FLASH_Queue_exec.c:89
FLASH_Data_aff FLASH_Queue_get_data_affinity(void)
Definition: FLASH_Queue.c:404
Definition: FLASH_Queue_exec.c:54
FLASH_Task * prev_task
Definition: FLA_type_defs.h:236
dim_t block_size
Definition: FLASH_Queue_exec.c:113
int FLA_Datatype
Definition: FLA_type_defs.h:49
int FLASH_Data_aff
Definition: FLA_type_defs.h:114
int n_write_blocks
Definition: FLA_type_defs.h:146
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
unsigned int FLASH_Queue_get_num_tasks(void)
Definition: FLASH_Queue.c:284
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
dim_t FLA_Obj_datatype_size(FLA_Datatype datatype)
Definition: FLA_Query.c:61
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
dim_t m_index
Definition: FLA_type_defs.h:134
void * func
Definition: FLA_type_defs.h:197
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Queue_invalidate_block_gpu()

void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

1850 {
1851  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1852  int j, k;
1853  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1854  FLA_Obj_gpu gpu_obj;
1855 
1856 #ifdef FLA_ENABLE_MULTITHREADING
1857  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1858 #endif
1859 
1860  // Locate the position of the block on the GPU.
1861  for ( k = 0; k < gpu_n_blocks; k++ )
1862  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1863  break;
1864 
1865  // The block is owned by other GPU.
1866  if ( k < gpu_n_blocks )
1867  {
1868  // Invalidate the block.
1869  args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;
1870 
1871  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1872  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1873 
1874  // Save the block that will be invalidated.
1875  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1876 
1877  // Shift all the blocks for the invalidated block.
1878  for ( j = k; j < gpu_n_blocks - 1; j++ )
1879  args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j + 1];
1880 
1881  // Move to the LRU block.
1882  args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
1883  }
1884 
1885 #ifdef FLA_ENABLE_MULTITHREADING
1886  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1887 #endif
1888 
1889  return;
1890 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_mark_gpu()

void FLASH_Queue_mark_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, i, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

1793 {
1794  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1795  int i, j, k;
1796  int thread = t->thread;
1797  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1798  FLA_Bool duplicate;
1799  FLA_Obj obj;
1800 
1801  // Mark all the output blocks on the GPU as dirty.
1802  for ( i = t->n_output_args - 1; i >= 0; i-- )
1803  {
1804  obj = t->output_arg[i];
1805 
1806  // Check for duplicate blocks.
1807  duplicate = FALSE;
1808 
1809  for ( j = 0; j < i && !duplicate; j++ )
1810  {
1811  if ( obj.base == t->output_arg[j].base )
1812  duplicate = TRUE;
1813  }
1814 
1815  // If the output block has not been processed before.
1816  if ( !duplicate )
1817  {
1818 #ifdef FLA_ENABLE_MULTITHREADING
1819  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1820 #endif
1821 
1822  // Locate the position of the block on the GPU.
1823  for ( k = 0; k < gpu_n_blocks; k++ )
1824  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1825  break;
1826 
1827  if ( k < gpu_n_blocks )
1828  {
1829  // Change the bits for the new dirty block.
1830  args->gpu[thread * gpu_n_blocks + k].clean = FALSE;
1831  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1832  }
1833 
1834 #ifdef FLA_ENABLE_MULTITHREADING
1835  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1836 #endif
1837  }
1838  }
1839 
1840  return;
1841 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
Definition: FLASH_Queue_exec.c:54
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_prefetch()

void FLASH_Queue_prefetch ( int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), i, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

1030 {
1031  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1032  int i;
1033  int size = args->size;
1034  FLA_Obj obj;
1035 
1036  // Prefetch blocks in opposite order to maintain LRU.
1037  for ( i = size - 1; i >= 0; i-- )
1038  {
1039  obj = args->prefetch[i];
1040 
1041  // Only prefetch if it is a valid block.
1042  if ( obj.base != NULL )
1043  {
1044  // Prefetch the block.
1046 
1047  // Record the prefetched block in the cache.
1048  args->cache[cache * size + i] = obj;
1049  }
1050  }
1051 
1052  return;
1053 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
void FLASH_Queue_prefetch_block(FLA_Obj obj)
Definition: FLASH_Queue_exec.c:1056
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
FLA_Obj * prefetch
Definition: FLASH_Queue_exec.c:89
Definition: FLASH_Queue_exec.c:54
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_prefetch_block()

void FLASH_Queue_prefetch_block ( FLA_Obj  obj)

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), i, scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

1062 {
1063  int i, inc;
1064  int line_size = FLASH_Queue_get_cache_line_size();
1065  int elem_size = FLA_Obj_elem_size( obj );
1066  int length = FLA_Obj_length( obj );
1067  int width = FLA_Obj_width( obj );
1068  FLA_Datatype datatype = FLA_Obj_datatype( obj );
1069 
1070  // Determine stride to prefetch block into cache.
1071  inc = line_size / elem_size;
1072 
1073  // Switch between the four different datatypes.
1074  switch ( datatype )
1075  {
1076  case FLA_FLOAT:
1077  {
1078  float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
1079  float access;
1080 
1081  // Access each cache line of the block.
1082  for ( i = 0; i < length * width; i += inc )
1083  access = buffer[i];
1084 
1085  // Prevent dead code elimination.
1086  access += 1.0;
1087 
1088  break;
1089  }
1090  case FLA_DOUBLE:
1091  {
1092  double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
1093  double access;
1094 
1095  // Access each cache line of the block.
1096  for ( i = 0; i < length * width; i += inc )
1097  access = buffer[i];
1098 
1099  // Prevent dead code elimination.
1100  access += 1.0;
1101 
1102  break;
1103  }
1104  case FLA_COMPLEX:
1105  {
1106  scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
1107  scomplex access;
1108 
1109  // Access each cache line of the block.
1110  for ( i = 0; i < length * width; i += inc )
1111  access = buffer[i];
1112 
1113  // Prevent dead code elimination.
1114  access.real += 1.0;
1115 
1116  break;
1117  }
1118  case FLA_DOUBLE_COMPLEX:
1119  {
1120  dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
1121  dcomplex access;
1122 
1123  // Access each cache line of the block.
1124  for ( i = 0; i < length * width; i += inc )
1125  access = buffer[i];
1126 
1127  // Prevent dead code elimination.
1128  access.real += 1.0;
1129 
1130  break;
1131  }
1132  case FLA_INT:
1133  {
1134  int *buffer = ( int * ) FLA_INT_PTR( obj );
1135  int access;
1136 
1137  // Access each cache line of the block.
1138  for ( i = 0; i < length * width; i += inc )
1139  access = buffer[i];
1140 
1141  // Prevent dead code elimination.
1142  access += 1.0;
1143 
1144  break;
1145  }
1146  default:
1147  // This default case should never execute.
1148  FLA_Check_error_code( FLA_INVALID_DATATYPE );
1149  }
1150 
1151  return;
1152 }
float real
Definition: blis_type_defs.h:134
double real
Definition: blis_type_defs.h:139
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition: FLA_Query.c:13
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
dim_t FLA_Obj_elem_size(FLA_Obj obj)
Definition: FLA_Query.c:95
dim_t FLASH_Queue_get_cache_line_size(void)
Definition: FLASH_Queue.c:524
Definition: blis_type_defs.h:132
int FLA_Datatype
Definition: FLA_type_defs.h:49
int i
Definition: bl1_axmyv2.c:145
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
Definition: blis_type_defs.h:137

◆ FLASH_Queue_update_block_gpu()

void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

1704 {
1705  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1706  int j, k;
1707  dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1708  FLA_Bool transfer = FALSE;
1709  FLA_Bool evict = FALSE;
1710  FLA_Obj_gpu evict_obj;
1711  FLA_Obj_gpu gpu_obj;
1712 
1713 #ifdef FLA_ENABLE_MULTITHREADING
1714  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1715 #endif
1716 
1717  // Locate the position of the block on GPU.
1718  for ( k = 0; k < gpu_n_blocks - 1; k++ )
1719  if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1720  break;
1721 
1722  // Save the pointer to the data on the GPU.
1723  buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;
1724 
1725  // Save the victim block.
1726  evict_obj = args->gpu[thread * gpu_n_blocks + k];
1727 
1728  // The block is not already in the GPU.
1729  if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
1730  {
1731  // Save for data transfer outside of critical section.
1732  transfer = TRUE;
1733 
1734  // Save for eviction outside of critical section.
1735  if ( evict_obj.obj.base != NULL && !evict_obj.clean )
1736  {
1737  evict = TRUE;
1738  args->victim[thread] = evict_obj;
1739  }
1740 
1741  // Save the block in the data structure.
1742  args->gpu[thread * gpu_n_blocks + k].obj = obj;
1743 
1744  // Make sure the new block is clean.
1745  args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
1746  args->gpu[thread * gpu_n_blocks + k].request = FALSE;
1747  }
1748 
1749  // Use the block on the GPU that is a hit or LRU.
1750  gpu_obj = args->gpu[thread * gpu_n_blocks + k];
1751 
1752  // Shift all the previous tasks for LRU replacement.
1753  for ( j = k; j > 0; j-- )
1754  args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
1755 
1756  // Place the block on the cache as the most recently used.
1757  args->gpu[thread * gpu_n_blocks] = gpu_obj;
1758 
1759 #ifdef FLA_ENABLE_MULTITHREADING
1760  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1761 #endif
1762 
1763  // Evict and flush the LRU dirty block.
1764  if ( evict )
1765  {
1766  FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );
1767 
1768 #ifdef FLA_ENABLE_MULTITHREADING
1769  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1770 #endif
1771 
1772  args->victim[thread].obj.base = NULL;
1773 
1774 #ifdef FLA_ENABLE_MULTITHREADING
1775  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1776 #endif
1777  }
1778 
1779  // Move the block to the GPU.
1780  if ( transfer )
1781  FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1782 
1783  return;
1784 }
FLA_Bool request
Definition: FLASH_Queue_exec.c:49
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Error FLASH_Queue_write_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:185
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
void * buffer_gpu
Definition: FLASH_Queue_exec.c:43
FLA_Error FLASH_Queue_read_gpu(FLA_Obj obj, void *buffer_gpu)
Definition: FLASH_Queue_gpu.c:205
FLA_Obj_gpu * victim
Definition: FLASH_Queue_exec.c:107
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLA_Bool clean
Definition: FLASH_Queue_exec.c:46
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
Definition: FLASH_Queue_exec.c:54
Definition: FLASH_Queue_exec.c:37
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_update_cache()

void FLASH_Queue_update_cache ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

853 {
854  int i, j;
855  FLA_Bool duplicate;
856  FLA_Obj obj;
857 
858  if ( t == NULL )
859  return;
860 
861  // Updating the input blocks.
862  for ( i = t->n_input_args - 1; i >= 0; i-- )
863  {
864  // Check for duplicate blocks.
865  duplicate = FALSE;
866 
867  for ( j = 0; j < t->n_output_args && !duplicate; j++ )
868  {
869  if ( t->input_arg[i].base == t->output_arg[j].base )
870  duplicate = TRUE;
871  }
872 
873  for ( j = 0; j < i && !duplicate; j++ )
874  {
875  if ( t->input_arg[i].base == t->input_arg[j].base )
876  duplicate = TRUE;
877  }
878 
879  // If the input block has not been processed before.
880  if ( !duplicate )
881  {
882  obj = t->input_arg[i];
883 
884  // Macroblock is used.
885  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
886  {
887  dim_t jj, kk;
888  dim_t m = FLA_Obj_length( obj );
889  dim_t n = FLA_Obj_width( obj );
890  dim_t cs = FLA_Obj_col_stride( obj );
891  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
892 
893  // Dependence analysis for each input block in macroblock.
894  for ( jj = 0; jj < n; jj++ )
895  for ( kk = 0; kk < m; kk++ )
896  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
897  t->cache, FALSE, arg );
898  }
899  else // Regular block.
900  {
901  FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
902  }
903  }
904  }
905 
906  // Updating the output blocks.
907  for ( i = t->n_output_args - 1; i >= 0; i-- )
908  {
909  // Check for duplicate blocks.
910  duplicate = FALSE;
911 
912  for ( j = 0; j < i && !duplicate; j++ )
913  {
914  if ( t->output_arg[i].base == t->output_arg[j].base )
915  duplicate = TRUE;
916  }
917 
918  // If the output block has not been processed before.
919  if ( !duplicate )
920  {
921  obj = t->output_arg[i];
922 
923  // Macroblock is used.
924  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
925  {
926  dim_t jj, kk;
927  dim_t m = FLA_Obj_length( obj );
928  dim_t n = FLA_Obj_width( obj );
929  dim_t cs = FLA_Obj_col_stride( obj );
930  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
931 
932  // Dependence analysis for each input block in macroblock.
933  for ( jj = 0; jj < n; jj++ )
934  for ( kk = 0; kk < m; kk++ )
935  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
936  t->cache, TRUE, arg );
937  }
938  else // Regular block.
939  {
940  FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
941  }
942  }
943  }
944 
945  return;
946 }
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
void FLASH_Queue_update_cache_block(FLA_Obj obj, int cache, FLA_Bool output, void *arg)
Definition: FLASH_Queue_exec.c:949
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int cache
Definition: FLA_type_defs.h:193
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51

◆ FLASH_Queue_update_cache_block()

void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), i, FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

958 {
959  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
960  int i, j, k;
961  int n_caches = args->n_caches;
962  int size = args->size;
963 
964 #ifdef FLA_ENABLE_MULTITHREADING
965  FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
966 #endif
967 
968  // Locate the position of the block in the cache.
969  for ( k = 0; k < size - 1; k++ )
970  {
971  if ( obj.base == args->cache[cache * size + k].base )
972  break;
973  }
974 
975  // Shift all the previous tasks for LRU replacement.
976  for ( j = k; j > 0; j-- )
977  args->cache[cache * size + j] = args->cache[cache * size + j - 1];
978 
979  // Place the block on the cache as the most recently used.
980  args->cache[cache * size] = obj;
981 
982 #ifdef FLA_ENABLE_MULTITHREADING
983  FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
984 #endif
985 
986  // Write invalidate if updating with output block.
987  if ( output )
988  {
989  for ( i = 0; i < n_caches; i++ )
990  {
991  if ( i != cache )
992  {
993 #ifdef FLA_ENABLE_MULTITHREADING
994  FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
995 #endif
996  // Locate the position of the block in the cache.
997  for ( k = 0; k < size; k++ )
998  {
999  if ( obj.base == args->cache[i * size + k].base )
1000  break;
1001  }
1002 
1003  // The block is owned by other thread.
1004  if ( k < size )
1005  {
1006  // Shift all the blocks for the invalidated block.
1007  for ( j = k; j < size - 1; j++ )
1008  args->cache[i * size + j] = args->cache[i * size + j + 1];
1009 
1010  // Invalidate the block.
1011  args->cache[i * size + size - 1].base = NULL;
1012  }
1013 #ifdef FLA_ENABLE_MULTITHREADING
1014  FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
1015 #endif
1016  }
1017  }
1018  }
1019 
1020  return;
1021 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Lock * cac_lock
Definition: FLASH_Queue_exec.c:74
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_caches
Definition: FLASH_Queue_exec.c:80
Definition: FLASH_Queue_exec.c:54
int i
Definition: bl1_axmyv2.c:145

◆ FLASH_Queue_update_gpu()

void FLASH_Queue_update_gpu ( FLASH_Task t,
void **  input_arg,
void **  output_arg,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), i, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

1599 {
1600  int i, j, k;
1601  int thread = t->thread;
1602  int n_threads = FLASH_Queue_get_num_threads();
1603  FLA_Bool duplicate;
1604 
1605  // None of the arguments can be macroblocks yet.
1606  // Complicating factor is copying macroblock to contiguous memory on GPU.
1607 
1608  // Bring the input arguments to the GPU.
1609  for ( i = t->n_input_args - 1; i >= 0; i-- )
1610  {
1611  // Check for duplicate blocks.
1612  duplicate = FALSE;
1613 
1614  for ( j = 0; j < t->n_output_args && !duplicate; j++ )
1615  {
1616  if ( t->input_arg[i].base == t->output_arg[j].base )
1617  duplicate = TRUE;
1618  }
1619 
1620  for ( j = 0; j < i && !duplicate; j++ )
1621  {
1622  if ( t->input_arg[i].base == t->input_arg[j].base )
1623  duplicate = TRUE;
1624  }
1625 
1626  // If the input block has not been processed before.
1627  if ( !duplicate )
1628  {
1629  FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
1630  }
1631  else
1632  {
1633  input_arg[i] = NULL;
1634  }
1635  }
1636 
1637  // Bring the output arguments to the GPU.
1638  for ( i = t->n_output_args - 1; i >= 0; i-- )
1639  {
1640  // Check for duplicate blocks.
1641  duplicate = FALSE;
1642 
1643  for ( j = 0; j < i && !duplicate; j++ )
1644  {
1645  if ( t->output_arg[i].base == t->output_arg[j].base )
1646  duplicate = TRUE;
1647  }
1648 
1649  // If the output block has not been processed before.
1650  if ( !duplicate )
1651  {
1652  FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );
1653 
1654  // Invalidate output blocks on all other GPUs.
1655  for ( k = 0; k < n_threads; k++ )
1656  if ( k != thread )
1658  }
1659  else
1660  {
1661  output_arg[i] = NULL;
1662  }
1663  }
1664 
1665  // Check to see if there are any duplicates.
1666  for ( i = t->n_input_args - 1; i >= 0; i-- )
1667  {
1668  for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
1669  {
1670  if ( t->input_arg[i].base == t->output_arg[j].base )
1671  input_arg[i] = output_arg[j];
1672  }
1673 
1674  for ( j = 0; j < i && input_arg[i] == NULL; j++ )
1675  {
1676  if ( t->input_arg[i].base == t->input_arg[j].base )
1677  input_arg[i] = input_arg[j];
1678  }
1679  }
1680 
1681  // Check to see if there are any duplicates.
1682  for ( i = t->n_output_args - 1; i >= 0; i-- )
1683  {
1684  for ( j = 0; j < i && output_arg[i] == NULL; j++ )
1685  {
1686  if ( t->output_arg[i].base == t->output_arg[j].base )
1687  output_arg[i] = output_arg[j];
1688  }
1689  }
1690 
1691  return;
1692 }
void FLASH_Queue_invalidate_block_gpu(FLA_Obj obj, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1844
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_input_args
Definition: FLA_type_defs.h:217
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
int FLA_Bool
Definition: FLA_type_defs.h:46
void FLASH_Queue_update_block_gpu(FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
Definition: FLASH_Queue_exec.c:1695
int i
Definition: bl1_axmyv2.c:145
int thread
Definition: FLA_type_defs.h:192
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223

◆ FLASH_Queue_wait_dequeue()

FLASH_Task* FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void *  arg 
)

References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().

684 {
685  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
686  FLASH_Task* t = NULL;
687  FLA_Bool enabled = FALSE;
688 
689 #ifdef FLA_ENABLE_GPU
690  enabled = FLASH_Queue_get_enabled_gpu();
691 #endif
692 
693  if ( args->wait_queue[queue].n_tasks > 0 )
694  {
695  // Dequeue the first task.
696  t = args->wait_queue[queue].head;
697 
698  if ( args->wait_queue[queue].n_tasks == 1 )
699  {
700  // Clear the queue of its only task.
701  args->wait_queue[queue].head = NULL;
702  args->wait_queue[queue].tail = NULL;
703  }
704  else
705  {
706  // Grab a new task if using cache affinity.
707  if ( FLASH_Queue_get_caching() )
708  {
709  // Determine if using GPU or not.
710  if ( enabled )
711  {
712 #ifdef FLA_ENABLE_GPU
713 #ifdef FLA_ENABLE_MULTITHREADING
714  FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***
715 #endif
716  // Find a task where the task has blocks currently in GPU.
717  t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
718 
719 #ifdef FLA_ENABLE_MULTITHREADING
720  FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
721 #endif
722 #endif
723  }
724  else
725  {
726 #ifdef FLA_ENABLE_MULTITHREADING
727  FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
728 #endif
729  // Find a task where the task has blocks currently in cache.
730  t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );
731 
732 #ifdef FLA_ENABLE_MULTITHREADING
733  FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
734 #endif
735  }
736 
737  // Adjust pointers if the task is head of waiting queue.
738  if ( t->prev_wait == NULL )
739  {
740  args->wait_queue[queue].head = t->next_wait;
741  args->wait_queue[queue].head->prev_wait = NULL;
742  }
743  else
744  {
745  t->prev_wait->next_wait = t->next_wait;
746  }
747 
748  // Adjust pointers if the task is tail of waiting queue.
749  if ( t->next_wait == NULL )
750  {
751  args->wait_queue[queue].tail = t->prev_wait;
752  args->wait_queue[queue].tail->next_wait = NULL;
753  }
754  else
755  {
756  t->next_wait->prev_wait = t->prev_wait;
757  }
758  }
759  else
760  {
761  // Adjust pointers in waiting queue.
762  args->wait_queue[queue].head = t->next_wait;
763  args->wait_queue[queue].head->prev_wait = NULL;
764  }
765  }
766 
767  // Clear the task's waiting linked list pointers.
768  t->prev_wait = NULL;
769  t->next_wait = NULL;
770 
771  // Decrement number of tasks on waiting queue.
772  args->wait_queue[queue].n_tasks--;
773  }
774 
775  return t;
776 }
FLA_Bool FLASH_Queue_get_caching(void)
Definition: FLASH_Queue.c:356
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLASH_Task * prev_wait
Definition: FLA_type_defs.h:240
FLASH_Task * FLASH_Queue_wait_dequeue_block(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:779
FLA_Lock * cac_lock
Definition: FLASH_Queue_exec.c:74
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
Definition: FLA_type_defs.h:183
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
FLASH_Task * tail
Definition: FLA_type_defs.h:180
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
FLASH_Task * head
Definition: FLA_type_defs.h:179
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Lock * gpu_lock
Definition: FLASH_Queue_exec.c:101
unsigned int n_tasks
Definition: FLA_type_defs.h:176

◆ FLASH_Queue_wait_dequeue_block()

FLASH_Task* FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, i, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

785 {
786  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
787  int i, j, k;
788  int size = args->size;
789  int n_tasks = args->wait_queue[queue].n_tasks;
790  FLA_Bool enabled = FALSE;
791  FLASH_Task* t;
792  FLA_Obj obj;
793  FLA_Obj mem;
794 
795 #ifdef FLA_ENABLE_GPU
796  enabled = FLASH_Queue_get_enabled_gpu();
797 
798  // If using GPUs, then only check GPU and not the cache.
799  if ( enabled )
801 #endif
802 
803  t = args->wait_queue[queue].head;
804 
805  // Check if any of the output blocks are in the cache.
806  for ( i = 0; i < n_tasks; i++ )
807  {
808  for ( j = 0; j < size; j++ )
809  {
810  // Initialize the memory just in case.
811  mem.base = NULL;
812 
813  // Determine if using GPU or not.
814  if ( enabled )
815  {
816 #ifdef FLA_ENABLE_GPU
817  mem = args->gpu[cache * size + j].obj;
818 #endif
819  }
820  else
821  {
822  mem = args->cache[cache * size + j];
823  }
824 
825  for ( k = 0; k < t->n_output_args; k++ )
826  {
827  obj = t->output_arg[k];
828 
829  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
830  obj = *FLASH_OBJ_PTR_AT( obj );
831 
832  // Return the task if its output block is in cache.
833  if ( mem.base == obj.base )
834  {
835  t->hit = TRUE;
836  return t;
837  }
838  }
839  }
840  t = t->next_wait;
841  }
842 
843  return args->wait_queue[queue].head;
844 }
FLA_Obj * cache
Definition: FLASH_Queue_exec.c:86
int size
Definition: FLASH_Queue_exec.c:83
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
FLA_Bool FLASH_Queue_get_enabled_gpu(void)
Definition: FLASH_Queue_gpu.c:91
Definition: FLA_type_defs.h:183
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
Definition: FLA_type_defs.h:158
dim_t FLASH_Queue_get_gpu_num_blocks(void)
Definition: FLASH_Queue_gpu.c:119
FLASH_Task * head
Definition: FLA_type_defs.h:179
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
int FLA_Bool
Definition: FLA_type_defs.h:46
FLA_Obj_gpu * gpu
Definition: FLASH_Queue_exec.c:104
int i
Definition: bl1_axmyv2.c:145
unsigned int n_tasks
Definition: FLA_type_defs.h:176
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
FLA_Obj obj
Definition: FLASH_Queue_exec.c:40

◆ FLASH_Queue_wait_enqueue()

void FLASH_Queue_wait_enqueue ( FLASH_Task t,
void *  arg 
)

References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, i, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().

632 {
633  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
634  int queue = t->queue;
635 
636  if ( args->wait_queue[queue].n_tasks == 0 )
637  {
638  args->wait_queue[queue].head = t;
639  args->wait_queue[queue].tail = t;
640  }
641  else
642  {
643  t->prev_wait = args->wait_queue[queue].tail;
644 
645  // Insertion sort of tasks in waiting queue.
646  if ( FLASH_Queue_get_sorting() )
647  {
648  while ( t->prev_wait != NULL )
649  {
650  if ( t->prev_wait->height >= t->height )
651  break;
652 
653  t->next_wait = t->prev_wait;
654  t->prev_wait = t->prev_wait->prev_wait;
655  }
656  }
657 
658  // Checking if the task is the head of the waiting queue.
659  if ( t->prev_wait == NULL )
660  args->wait_queue[queue].head = t;
661  else
662  t->prev_wait->next_wait = t;
663 
664  // Checking if the task is the tail of the waiting queue.
665  if ( t->next_wait == NULL )
666  args->wait_queue[queue].tail = t;
667  else
668  t->next_wait->prev_wait = t;
669  }
670 
671  // Increment number of tasks on waiting queue.
672  args->wait_queue[queue].n_tasks++;
673 
674  return;
675 }
int height
Definition: FLA_type_defs.h:191
FLA_Bool FLASH_Queue_get_sorting(void)
Definition: FLASH_Queue.c:332
FLASH_Task * prev_wait
Definition: FLA_type_defs.h:240
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
FLASH_Task * tail
Definition: FLA_type_defs.h:180
FLASH_Task * head
Definition: FLA_type_defs.h:179
int queue
Definition: FLA_type_defs.h:190
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
unsigned int n_tasks
Definition: FLA_type_defs.h:176

◆ FLASH_Queue_work_stealing()

FLASH_Task* FLASH_Queue_work_stealing ( int  queue,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

1161 {
1162  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1163  int q;
1164  int n_queues = args->n_queues;
1165  FLASH_Task* t = NULL;
1166 
1167  // Do not perform work stealing if there is only one queue.
1168  if ( n_queues == 1 )
1169  return t;
1170 
1171  // Find a random queue not equal to the current queue.
1172  do
1173  {
1174 #ifdef FLA_ENABLE_WINDOWS_BUILD
1175  rand_s( &q );
1176  q = q % n_queues;
1177 #else
1178 #ifdef FLA_ENABLE_TIDSP
1179  q = rand() % n_queues;
1180 #else
1181  q = lrand48() % n_queues;
1182 #endif
1183 #endif
1184  }
1185  while ( q == queue );
1186 
1187 #ifdef FLA_ENABLE_MULTITHREADING
1188  FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189 #endif
1190 
1191  // If there are tasks that this thread can steal.
1192  if ( args->wait_queue[q].n_tasks > 0 )
1193  {
1194  // Dequeue the last task.
1195  t = args->wait_queue[q].tail;
1196 
1197  if ( args->wait_queue[q].n_tasks == 1 )
1198  {
1199  // Clear the queue of its only task.
1200  args->wait_queue[q].head = NULL;
1201  args->wait_queue[q].tail = NULL;
1202  }
1203  else
1204  {
1205  // Adjust pointers in waiting queue.
1206  args->wait_queue[q].tail = t->prev_wait;
1207  args->wait_queue[q].tail->next_wait = NULL;
1208  }
1209 
1210  // Reset waiting queue data about the stolen task.
1211  t->queue = queue;
1212  t->prev_wait = NULL;
1213  t->next_wait = NULL;
1214 
1215  args->wait_queue[q].n_tasks--;
1216  }
1217 
1218 #ifdef FLA_ENABLE_MULTITHREADING
1219  FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220 #endif
1221 
1222  return t;
1223 }
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLASH_Task * prev_wait
Definition: FLA_type_defs.h:240
Definition: FLA_type_defs.h:183
FLASH_Task * next_wait
Definition: FLA_type_defs.h:241
FLASH_Task * tail
Definition: FLA_type_defs.h:180
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
FLASH_Task * head
Definition: FLA_type_defs.h:179
int queue
Definition: FLA_type_defs.h:190
Definition: FLASH_Queue_exec.c:54
FLASH_Queue * wait_queue
Definition: FLASH_Queue_exec.c:92
unsigned int n_tasks
Definition: FLA_type_defs.h:176
int n_queues
Definition: FLASH_Queue_exec.c:77

◆ FLASH_Task_free_parallel()

void FLASH_Task_free_parallel ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), i, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

2452 {
2453  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2454  int i, j, k;
2455  int thread;
2456  int n_threads = FLASH_Queue_get_num_threads();
2457  FLASH_Dep* d;
2458  FLASH_Dep* next_dep;
2459  FLA_Obj obj;
2460 
2461  // Clearing the last write task in each output block.
2462  for ( i = 0; i < t->n_output_args; i++ )
2463  {
2464  obj = t->output_arg[i];
2465 
2466  // Macroblock is used.
2467  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468  {
2469  dim_t jj, kk;
2470  dim_t m = FLA_Obj_length( obj );
2471  dim_t n = FLA_Obj_width( obj );
2472  dim_t cs = FLA_Obj_col_stride( obj );
2473  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2474 
2475  // Clear each block in macroblock.
2476  for ( jj = 0; jj < n; jj++ )
2477  for ( kk = 0; kk < m; kk++ )
2478  ( buf + jj * cs + kk )->base->write_task = NULL;
2479  }
2480  else // Clear regular block.
2481  {
2482  obj.base->write_task = NULL;
2483  }
2484  }
2485 
2486  // Cleaning the last read tasks in each input block.
2487  for ( i = 0; i < t->n_input_args; i++ )
2488  {
2489  obj = t->input_arg[i];
2490 
2491  // Macroblock is used.
2492  if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493  {
2494  dim_t jj, kk;
2495  dim_t m = FLA_Obj_length( obj );
2496  dim_t n = FLA_Obj_width( obj );
2497  dim_t cs = FLA_Obj_col_stride( obj );
2498  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2499 
2500  // Clear each block in macroblock.
2501  for ( jj = 0; jj < n; jj++ )
2502  {
2503  for ( kk = 0; kk < m; kk++ )
2504  {
2505  obj = *( buf + jj * cs + kk );
2506 
2507  thread = obj.base->n_read_blocks % n_threads;
2508 
2509  FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510 
2511  k = obj.base->n_read_tasks;
2512  d = obj.base->read_task_head;
2513 
2514  obj.base->n_read_tasks = 0;
2515  obj.base->read_task_head = NULL;
2516  obj.base->read_task_tail = NULL;
2517 
2518  FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519 
2520  for ( j = 0; j < k; j++ )
2521  {
2522  next_dep = d->next_dep;
2523  FLA_free( d );
2524  d = next_dep;
2525  }
2526  }
2527  }
2528  }
2529  else // Regular block.
2530  {
2531  thread = obj.base->n_read_blocks % n_threads;
2532 
2533  FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534 
2535  k = obj.base->n_read_tasks;
2536  d = obj.base->read_task_head;
2537 
2538  obj.base->n_read_tasks = 0;
2539  obj.base->read_task_head = NULL;
2540  obj.base->read_task_tail = NULL;
2541 
2542  FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543 
2544  for ( j = 0; j < k; j++ )
2545  {
2546  next_dep = d->next_dep;
2547  FLA_free( d );
2548  d = next_dep;
2549  }
2550  }
2551  }
2552 
2553  // Free the dep_arg field of t.
2554  d = t->dep_arg_head;
2555 
2556  for ( i = 0; i < t->n_dep_args; i++ )
2557  {
2558  next_dep = d->next_dep;
2559  FLA_free( d );
2560  d = next_dep;
2561  }
2562 
2563  // Free the int_arg field of t.
2564  FLA_free( t->int_arg );
2565 
2566  // Free the fla_arg field of t.
2567  FLA_free( t->fla_arg );
2568 
2569  // Free the input_arg field of t.
2570  FLA_free( t->input_arg );
2571 
2572  // Free the output_arg field of t.
2573  FLA_free( t->output_arg );
2574 
2575  // Finally, free the struct itself.
2576  FLA_free( t );
2577 
2578  return;
2579 }
int * int_arg
Definition: FLA_type_defs.h:210
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
unsigned long dim_t
Definition: FLA_type_defs.h:71
FLASH_Dep * read_task_tail
Definition: FLA_type_defs.h:151
FLA_Lock * war_lock
Definition: FLASH_Queue_exec.c:70
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
Definition: FLA_type_defs.h:244
FLA_Base_obj * base
Definition: FLA_type_defs.h:168
int n_read_tasks
Definition: FLA_type_defs.h:149
Definition: FLA_type_defs.h:158
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
void FLA_free(void *ptr)
Definition: FLA_Memory.c:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
int n_input_args
Definition: FLA_type_defs.h:217
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int n_output_args
Definition: FLA_type_defs.h:221
FLA_Obj * output_arg
Definition: FLA_type_defs.h:222
Definition: FLASH_Queue_exec.c:54
FLASH_Task * write_task
Definition: FLA_type_defs.h:154
FLA_Obj * input_arg
Definition: FLA_type_defs.h:218
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
int i
Definition: bl1_axmyv2.c:145
int n_read_blocks
Definition: FLA_type_defs.h:145
FLA_Obj * fla_arg
Definition: FLA_type_defs.h:214
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Elemtype FLA_Obj_elemtype(FLA_Obj obj)
Definition: FLA_Query.c:51
FLASH_Dep * read_task_head
Definition: FLA_type_defs.h:150

◆ FLASH_Task_update_binding()

FLASH_Task* FLASH_Task_update_binding ( FLASH_Task t,
FLASH_Task r,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

2401 {
2402  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2403  int queue;
2404 
2405  if ( r == NULL )
2406  {
2407  // There are no tasks on waiting queue, so bind the first task.
2408  r = t;
2409  r->hit = TRUE;
2410  }
2411  else
2412  {
2413  // Swap the binded task for the new ready task.
2414  if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
2415  {
2416  queue = r->queue;
2417  r->hit = FALSE;
2418 
2419  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2420 
2421  // Place swapped task back onto waiting queue.
2422  FLASH_Queue_wait_enqueue( r, arg );
2423 
2424  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2425 
2426  // Bind the new ready task.
2427  r = t;
2428  r->hit = TRUE;
2429  }
2430  else // Keep the binded task and enqueue new ready task.
2431  {
2432  queue = t->queue;
2433 
2434  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2435 
2436  FLASH_Queue_wait_enqueue( t, arg );
2437 
2438  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2439  }
2440  }
2441 
2442  return r;
2443 }
int height
Definition: FLA_type_defs.h:191
FLA_Bool FLASH_Queue_get_sorting(void)
Definition: FLASH_Queue.c:332
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int queue
Definition: FLA_type_defs.h:190
FLA_Bool hit
Definition: FLA_type_defs.h:194
Definition: FLASH_Queue_exec.c:54
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626

◆ FLASH_Task_update_dependencies()

FLASH_Task* FLASH_Task_update_dependencies ( FLASH_Task t,
void *  arg 
)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), i, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().

2322 {
2323  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2324  int i;
2325  int q = t->queue;
2326  int queue;
2327  int thread;
2328  int n_threads = FLASH_Queue_get_num_threads();
2329  FLA_Bool caching = FLASH_Queue_get_caching();
2331  FLA_Bool available;
2332  FLASH_Task* task;
2333  FLASH_Task* r = NULL;
2334  FLASH_Dep* d = t->dep_arg_head;
2335 
2336  // Dequeue task to bind to thread if caching is enabled.
2337  if ( caching )
2338  {
2339  FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
2340 
2341  // Obtain task to execute.
2342  r = FLASH_Queue_wait_dequeue( q, t->cache, arg );
2343 
2344  FLA_Lock_release( &(args->run_lock[q]) ); // R ***
2345  }
2346 
2347  // Check each dependent task.
2348  for ( i = 0; i < t->n_dep_args; i++ )
2349  {
2350  if ( stealing )
2351  {
2352  // Place all dependent tasks onto same queue as predecessor task.
2353  d->task->queue = q;
2354  }
2355 
2356  task = d->task;
2357  queue = task->queue;
2358  thread = task->order % n_threads;
2359 
2360  FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***
2361 
2362  task->n_ready--;
2363  available = ( task->n_ready == 0 );
2364 
2365  FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***
2366 
2367  // Place newly ready tasks on waiting queue.
2368  if ( available )
2369  {
2370  // If caching is enabled and the task belongs to this thread's queue.
2371  if ( caching && q == queue )
2372  {
2373  // Determine if there is a new binded task.
2374  r = FLASH_Task_update_binding( task, r, arg );
2375  }
2376  else
2377  {
2378  FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
2379 
2380  FLASH_Queue_wait_enqueue( task, arg );
2381 
2382  FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
2383  }
2384  }
2385 
2386  // Go to the next dep.
2387  d = d->next_dep;
2388  }
2389 
2390  return r;
2391 }
FLASH_Dep * dep_arg_head
Definition: FLA_type_defs.h:232
FLA_Bool FLASH_Queue_get_caching(void)
Definition: FLASH_Queue.c:356
void FLA_Lock_release(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:58
Definition: FLA_type_defs.h:244
int n_ready
Definition: FLA_type_defs.h:186
Definition: FLA_type_defs.h:183
FLASH_Task * task
Definition: FLA_type_defs.h:247
FLASH_Dep * next_dep
Definition: FLA_type_defs.h:250
int n_dep_args
Definition: FLA_type_defs.h:231
FLA_Lock * run_lock
Definition: FLASH_Queue_exec.c:62
void FLA_Lock_acquire(FLA_Lock *fla_lock_ptr)
Definition: FLA_Lock.c:43
int queue
Definition: FLA_type_defs.h:190
FLA_Bool FLASH_Queue_get_work_stealing(void)
Definition: FLASH_Queue.c:380
int order
Definition: FLA_type_defs.h:189
Definition: FLASH_Queue_exec.c:54
FLASH_Task * FLASH_Queue_wait_dequeue(int queue, int cache, void *arg)
Definition: FLASH_Queue_exec.c:678
void FLASH_Queue_wait_enqueue(FLASH_Task *t, void *arg)
Definition: FLASH_Queue_exec.c:626
int FLA_Bool
Definition: FLA_type_defs.h:46
int i
Definition: bl1_axmyv2.c:145
FLASH_Task * FLASH_Task_update_binding(FLASH_Task *t, FLASH_Task *r, void *arg)
Definition: FLASH_Queue_exec.c:2394
int cache
Definition: FLA_type_defs.h:193
unsigned int FLASH_Queue_get_num_threads(void)
Definition: FLASH_Queue.c:223
FLA_Lock * dep_lock
Definition: FLASH_Queue_exec.c:66

◆ RCCE_acquire_lock()

int RCCE_acquire_lock ( int  )

◆ RCCE_release_lock()

int RCCE_release_lock ( int  )

◆ RCCE_ue()

int RCCE_ue ( void  )

◆ RCCE_wtime()

double RCCE_wtime ( void  )

◆ Synch_all()

void Synch_all ( )