Blender V3.3
path_trace_work_gpu.cpp
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
6 
7 #include "device/device.h"
8 
10 #include "scene/scene.h"
11 #include "session/buffers.h"
12 #include "util/log.h"
13 #include "util/string.h"
14 #include "util/tbb.h"
15 #include "util/time.h"
16 
17 #include "kernel/types.h"
18 
20 
22 {
23  size_t state_size = 0;
24 
25 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
26 #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
27 #define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
28 #define KERNEL_STRUCT_END(name) \
29  break; \
30  }
31 #define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
32  if (array_index >= gpu_array_size - 1) { \
33  break; \
34  } \
35  }
36 /* TODO(sergey): Look into a better estimation for fields which depend on scene features. Maybe
37  * the maximum state calculation should happen in `alloc_work_memory()`, so that we can react to
38  * an updated scene state here.
39  * Until then, use a common value. Currently this size is only used for logging, but it is
40  * fragile to rely on it. */
41 #define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
42 
44 
46 
47 #undef KERNEL_STRUCT_BEGIN
48 #undef KERNEL_STRUCT_MEMBER
49 #undef KERNEL_STRUCT_ARRAY_MEMBER
50 #undef KERNEL_STRUCT_END
51 #undef KERNEL_STRUCT_END_ARRAY
52 #undef KERNEL_STRUCT_VOLUME_STACK_SIZE
53 
54  return state_size;
55 }
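/* --- Illustrative aside (not part of the original file) -----------------------------------
 * The size estimate above relies on an X-macro pattern: the KERNEL_STRUCT_* macros are
 * defined, the integrator state template headers are included so that every member
 * declaration expands into a `state_size += sizeof(type);` statement, and the macros are
 * undefined again. A minimal standalone sketch of the same technique, using a hypothetical
 * member list (the real members live in the state template headers): */
#include <cstddef>

#define EXAMPLE_MEMBER(type, name) size += sizeof(type);

static inline size_t example_estimate_state_size()
{
  size_t size = 0;
  /* In the real code this list is produced by including a state template header. */
  EXAMPLE_MEMBER(float, transparency)
  EXAMPLE_MEMBER(int, bounce)
  EXAMPLE_MEMBER(unsigned int, path_flag)
  return size; /* Typically 4 + 4 + 4 = 12 bytes for this hypothetical state. */
}

#undef EXAMPLE_MEMBER
/* ------------------------------------------------------------------------------------------- */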
56 
58  Film *film,
59  DeviceScene *device_scene,
60  bool *cancel_requested_flag)
61  : PathTraceWork(device, film, device_scene, cancel_requested_flag),
62  queue_(device->gpu_queue_create()),
63  integrator_state_soa_kernel_features_(0),
64  integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
65  integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
66  integrator_shader_raytrace_sort_counter_(
67  device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
68  integrator_shader_mnee_sort_counter_(
69  device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
70  integrator_shader_sort_prefix_sum_(
71  device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
72  integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
73  integrator_next_shadow_path_index_(
74  device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
75  queued_paths_(device, "queued_paths", MEM_READ_WRITE),
76  num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
77  work_tiles_(device, "work_tiles", MEM_READ_WRITE),
78  display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
79  max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
80  min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
81  max_active_main_path_index_(0)
82 {
83  memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
84 
85  /* Limit the number of active paths to half of the overall state pool. This is due to the logic
86  * in path compaction, which relies on the fact that path regeneration does not happen until at
87  * least half of the states are available again. */
89 }
90 
92 {
93  /* IntegrateState allocated as structure of arrays. */
94 
95  /* Check if we already allocated memory for the required features. */
96  const int requested_volume_stack_size = device_scene_->data.volume_stack_size;
97  const uint kernel_features = device_scene_->data.kernel_features;
98  if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features &&
99  integrator_state_soa_volume_stack_size_ >= requested_volume_stack_size) {
100  return;
101  }
102  integrator_state_soa_kernel_features_ = kernel_features;
104  requested_volume_stack_size);
105 
106  /* Allocate a device-only memory buffer for each struct member, and then
107  * write the pointers into a struct that resides in constant memory.
108  *
109  * TODO: store float3 in separate XYZ arrays. */
110 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
111 #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
112  if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
113  device_only_memory<type> *array = new device_only_memory<type>(device_, \
114  "integrator_state_" #name); \
115  array->alloc_to_device(max_num_paths_); \
116  integrator_state_soa_.emplace_back(array); \
117  integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
118  }
119 #define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
120  if ((kernel_features & (feature)) && \
121  (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
122  device_only_memory<type> *array = new device_only_memory<type>(device_, \
123  "integrator_state_" #name); \
124  array->alloc_to_device(max_num_paths_); \
125  integrator_state_soa_.emplace_back(array); \
126  integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
127  }
128 #define KERNEL_STRUCT_END(name) \
129  break; \
130  }
131 #define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
132  if (array_index >= gpu_array_size - 1) { \
133  break; \
134  } \
135  }
136 #define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
137 
139 
141 
142 #undef KERNEL_STRUCT_BEGIN
143 #undef KERNEL_STRUCT_MEMBER
144 #undef KERNEL_STRUCT_ARRAY_MEMBER
145 #undef KERNEL_STRUCT_END
146 #undef KERNEL_STRUCT_END_ARRAY
147 #undef KERNEL_STRUCT_VOLUME_STACK_SIZE
148 
149  if (VLOG_IS_ON(3)) {
150  size_t total_soa_size = 0;
151  for (auto &&soa_memory : integrator_state_soa_) {
152  total_soa_size += soa_memory->memory_size();
153  }
154 
155  VLOG_DEVICE_STATS << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
156  }
157 }
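/* --- Illustrative aside (not part of the original file) -----------------------------------
 * Conceptually, the SoA allocation above creates one device array per state member and
 * records each array's device pointer in a plain struct that is later copied to constant
 * memory, so kernels can index `member[path_index]`. A host-side sketch of the same idea,
 * with hypothetical member names and std::vector standing in for device_only_memory: */
#include <vector>

struct ExampleStateGPU {
  float *throughput = nullptr;
  int *bounce = nullptr;
};

static inline void example_alloc_soa(ExampleStateGPU &state,
                                     std::vector<float> &throughput_storage,
                                     std::vector<int> &bounce_storage,
                                     const int max_num_paths)
{
  /* One flat array per member, sized for the maximum number of in-flight paths. */
  throughput_storage.resize(max_num_paths);
  bounce_storage.resize(max_num_paths);

  /* Pointers gathered into the struct that the kernels receive. */
  state.throughput = throughput_storage.data();
  state.bounce = bounce_storage.data();
}
/* ------------------------------------------------------------------------------------------- */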
158 
160 {
161  if (integrator_queue_counter_.size() == 0) {
167  }
168 
169  /* Allocate data for active path index arrays. */
170  if (num_queued_paths_.size() == 0) {
173  }
174 
175  if (queued_paths_.size() == 0) {
177  /* TODO: this could be skipped if we had a function to just allocate on device. */
179  }
180 }
181 
183 {
184  /* Compute sort partitions, to balance between memory locality and coherence.
185  * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
186  * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
187  */
189  if (device_scene_->data.max_shaders < 300) {
190  const int num_elements = queue_->num_sort_partition_elements();
191  if (num_elements) {
192  num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
193  }
194  }
195 
198 
199  /* Allocate arrays for shader sorting. */
200  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
201  if (integrator_shader_sort_counter_.size() < sort_buckets) {
206 
209  }
210 
212  if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
217  }
218  }
219 
221  if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
226  }
227  }
228 }
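/* --- Illustrative aside (not part of the original file) -----------------------------------
 * The sort-bucket sizing above amounts to one counter per (shader, partition) pair, where
 * the number of partitions is derived from how many path states fit into one partition
 * element. A tiny sketch of that arithmetic, with hypothetical numbers: */
#include <algorithm>

static inline int example_sort_bucket_count(const int max_num_paths,
                                            const int num_partition_elements,
                                            const int max_shaders)
{
  /* E.g. 1'000'000 paths with 65'536 elements per partition -> 15 partitions. */
  const int num_sort_partitions = std::max(max_num_paths / num_partition_elements, 1);
  return max_shaders * num_sort_partitions;
}
/* ------------------------------------------------------------------------------------------- */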
229 
231 {
235 
238  }
239 
244 
247  }
248 }
249 
251 {
256 }
257 
259 {
260  queue_->init_execution();
261 
262  /* Copy to device side struct in constant memory. */
264  "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
265 }
266 
268  int start_sample,
269  int samples_num,
270  int sample_offset)
271 {
272  /* Limit the number of states for the tile and rely on greedy scheduling of tiles. This allows
273  * adding more work (because tiles are smaller, so there is a higher chance that more paths will
274  * become busy after adding new tiles). This is especially important for the shadow catcher,
275  * which schedules work in halves of the available number of paths. */
278  0);
280  start_sample,
281  samples_num,
282  sample_offset,
283  device_scene_->data.integrator.scrambling_distance);
284 
285  enqueue_reset();
286 
287  int num_iterations = 0;
288  uint64_t num_busy_accum = 0;
289 
290  /* TODO: set a hard limit in case of undetected kernel failures? */
291  while (true) {
292  /* Enqueue work from the scheduler, on start or when there are not enough
293  * paths to keep the device occupied. */
294  bool finished;
295  if (enqueue_work_tiles(finished)) {
296  /* Copy stats from the device. */
297  queue_->copy_from_device(integrator_queue_counter_);
298 
299  if (!queue_->synchronize()) {
300  break; /* Stop on error. */
301  }
302  }
303 
304  if (is_cancel_requested()) {
305  break;
306  }
307 
308  /* Stop if no more work remaining. */
309  if (finished) {
310  break;
311  }
312 
313  /* Enqueue one of the path iteration kernels. */
314  if (enqueue_path_iteration()) {
315  /* Copy stats from the device. */
316  queue_->copy_from_device(integrator_queue_counter_);
317 
318  if (!queue_->synchronize()) {
319  break; /* Stop on error. */
320  }
321  }
322 
323  if (is_cancel_requested()) {
324  break;
325  }
326 
327  num_busy_accum += num_active_main_paths_paths();
328  ++num_iterations;
329  }
330 
331  statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
332 }
333 
335 {
336  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
337 
338  int max_num_queued = 0;
340 
341  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
342  if (queue_counter->num_queued[i] > max_num_queued) {
343  kernel = (DeviceKernel)i;
344  max_num_queued = queue_counter->num_queued[i];
345  }
346  }
347 
348  return kernel;
349 }
350 
352 {
354 
356  queue_->zero_to_device(integrator_queue_counter_);
357  queue_->zero_to_device(integrator_shader_sort_counter_);
360  }
363  }
364 
365  /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
366  * Zero the counter on the host side because `zero_to_device()` does not do it. */
369  }
370 }
371 
373 {
374  /* Count the number of queued (active) paths across all integrator kernels. */
375  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
376 
377  int num_active_paths = 0;
378  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
379  num_active_paths += queue_counter->num_queued[i];
380  }
381 
382  if (num_active_paths == 0) {
383  return false;
384  }
385 
386  /* Find kernel to execute, with max number of queued paths. */
388  if (kernel == DEVICE_KERNEL_NUM) {
389  return false;
390  }
391 
392  /* For kernels that add shadow paths, check if there is enough space available.
393  * If not, schedule shadow kernels first to clear out the shadow paths. */
394  int num_paths_limit = INT_MAX;
395 
398 
399  const int available_shadow_paths = max_num_paths_ -
401  if (available_shadow_paths < queue_counter->num_queued[kernel]) {
404  return true;
405  }
406  else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
408  return true;
409  }
410  }
411  else if (kernel_creates_ao_paths(kernel)) {
412  /* AO kernel creates two shadow paths, so limit number of states to schedule. */
413  num_paths_limit = available_shadow_paths / 2;
414  }
415  }
416 
417  /* Schedule kernel with maximum number of queued items. */
418  enqueue_path_iteration(kernel, num_paths_limit);
419 
420  /* Update next shadow path index for kernels that can add shadow paths. */
422  queue_->copy_from_device(integrator_next_shadow_path_index_);
423  }
424 
425  return true;
426 }
427 
429 {
430  device_ptr d_path_index = 0;
431 
432  /* Create array of path indices for which this kernel is queued to be executed. */
434 
436  int num_queued = queue_counter->num_queued[kernel];
437 
439  /* Compute array of active paths, sorted by shader. */
440  work_size = num_queued;
441  d_path_index = queued_paths_.device_pointer;
442 
445  }
446  else if (num_queued < work_size) {
447  work_size = num_queued;
448  d_path_index = queued_paths_.device_pointer;
449 
451  /* Compute array of active shadow paths for specific kernel. */
453  }
454  else {
455  /* Compute array of active paths for specific kernel. */
457  }
458  }
459 
460  work_size = min(work_size, num_paths_limit);
461 
463 
464  switch (kernel) {
466  /* Closest ray intersection kernels with integrator state and render buffer. */
467  DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
468 
469  queue_->enqueue(kernel, work_size, args);
470  break;
471  }
472 
476  /* Ray intersection kernels with integrator state. */
477  DeviceKernelArguments args(&d_path_index, &work_size);
478 
479  queue_->enqueue(kernel, work_size, args);
480  break;
481  }
489  /* Shading kernels with integrator state and render buffer. */
490  DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
491 
492  queue_->enqueue(kernel, work_size, args);
493  break;
494  }
495 
496  default:
497  LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
498  << " used for path iteration, should never happen.";
499  break;
500  }
501 }
502 
504  DeviceKernel queued_kernel,
505  const int num_paths_limit)
506 {
507  int d_queued_kernel = queued_kernel;
508  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
510  assert(d_counter != 0 && d_prefix_sum != 0);
511 
512  /* Compute prefix sum of number of active paths with each shader. */
513  {
514  const int work_size = 1;
515  int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
516 
517  DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
518 
519  queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
520  }
521 
522  queue_->zero_to_device(num_queued_paths_);
523 
524  /* Launch kernel to fill the active paths arrays. */
525  {
526  /* TODO: this could be smaller for terminated paths based on amount of work we want
527  * to schedule, and also based on num_paths_limit.
528  *
529  * Also, when the number of paths is limited it may be better to prefer paths from the
530  * end of the array since compaction would need to do less work. */
531  const int work_size = kernel_max_active_main_path_index(queued_kernel);
532 
533  device_ptr d_queued_paths = queued_paths_.device_pointer;
534  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
535 
537  &num_paths_limit,
538  &d_queued_paths,
539  &d_num_queued_paths,
540  &d_counter,
541  &d_prefix_sum,
542  &d_queued_kernel);
543 
544  queue_->enqueue(kernel, work_size, args);
545  }
546 }
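/* --- Illustrative aside (not part of the original file) -----------------------------------
 * The two kernel launches above implement a counting sort on the GPU: per-shader-key
 * counters maintained on the device are turned into bucket offsets by a prefix sum, and the
 * sorted-paths-array kernel then scatters each queued path index into its shader's bucket.
 * A host-side sketch of the same counting-sort-by-key idea (hypothetical helper, where
 * `path_key` maps each path to its shader key): */
#include <vector>

static inline std::vector<int> example_sort_paths_by_key(const std::vector<int> &path_key,
                                                         const int num_keys)
{
  /* Counters: how many paths use each key (shader). */
  std::vector<int> counter(num_keys, 0);
  for (const int key : path_key) {
    counter[key]++;
  }

  /* Exclusive prefix sum turns counts into bucket start offsets. */
  std::vector<int> offset(num_keys, 0);
  for (int i = 1; i < num_keys; i++) {
    offset[i] = offset[i - 1] + counter[i - 1];
  }

  /* Scatter path indices into their buckets; the result is ordered by key. */
  std::vector<int> sorted_indices(path_key.size());
  for (int path_index = 0; path_index < (int)path_key.size(); path_index++) {
    sorted_indices[offset[path_key[path_index]]++] = path_index;
  }
  return sorted_indices;
}
/* ------------------------------------------------------------------------------------------- */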
547 
549 {
550  int d_queued_kernel = queued_kernel;
551 
552  /* Launch kernel to fill the active paths arrays. */
553  const int work_size = kernel_max_active_main_path_index(queued_kernel);
554  device_ptr d_queued_paths = queued_paths_.device_pointer;
555  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
556 
557  DeviceKernelArguments args(&work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
558 
559  queue_->zero_to_device(num_queued_paths_);
560  queue_->enqueue(kernel, work_size, args);
561 }
562 
564 {
565  /* Early out if there is nothing that needs to be compacted. */
566  if (num_active_paths == 0) {
568  return;
569  }
570 
571  const int min_compact_paths = 32;
573  max_active_main_path_index_ < min_compact_paths) {
574  return;
575  }
576 
577  /* Compact. */
583 
584  /* Adjust the max active path index now that we know which part of the array is actually used. */
586 }
587 
589 {
591  const int num_active_paths =
594 
595  /* Early out if there is nothing that needs to be compacted. */
596  if (num_active_paths == 0) {
600  }
601  return;
602  }
603 
604  /* Compact only if we can reduce the space used by half; not every time, since
605  * compaction has a cost. */
606  const float shadow_compact_ratio = 0.5f;
607  const int min_compact_paths = 32;
608  if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
609  integrator_next_shadow_path_index_.data()[0] < min_compact_paths) {
610  return;
611  }
612 
613  /* Compact. */
619 
620  /* Adjust the max active path index now that we know which part of the array is actually used. */
623 }
624 
626  const int max_active_path_index,
627  DeviceKernel terminated_paths_kernel,
628  DeviceKernel compact_paths_kernel,
629  DeviceKernel compact_kernel)
630 {
631  /* Compact fragmented path states into the start of the array, moving any paths
632  * with index higher than the number of active paths into the gaps. */
633  device_ptr d_compact_paths = queued_paths_.device_pointer;
634  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
635 
636  /* Create array with terminated paths that we can write to. */
637  {
638  /* TODO: can the work size be reduced here? */
639  int offset = num_active_paths;
641 
642  DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);
643 
644  queue_->zero_to_device(num_queued_paths_);
645  queue_->enqueue(terminated_paths_kernel, work_size, args);
646  }
647 
648  /* Create array of paths that we need to compact, where the path index is bigger
649  * than the number of active paths. */
650  {
651  int work_size = max_active_path_index;
652 
654  &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
655 
656  queue_->zero_to_device(num_queued_paths_);
657  queue_->enqueue(compact_paths_kernel, work_size, args);
658  }
659 
660  queue_->copy_from_device(num_queued_paths_);
661  queue_->synchronize();
662 
663  int num_compact_paths = num_queued_paths_.data()[0];
664 
665  /* Move paths into gaps. */
666  if (num_compact_paths > 0) {
667  int work_size = num_compact_paths;
668  int active_states_offset = 0;
670 
673 
674  queue_->enqueue(compact_kernel, work_size, args);
675  }
676 }
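/* --- Illustrative aside (not part of the original file) -----------------------------------
 * The three launches above perform a compaction: gather the free (terminated) slots that lie
 * below `num_active_paths`, gather the active slots that lie at or above it, and move each of
 * the latter into one of the former so that all active states end up in the prefix of the
 * array. A host-side sketch of that idea on a simple boolean "active" mask (hypothetical
 * helper; the real kernels move whole SoA states): */
#include <vector>

static inline void example_compact_states(std::vector<bool> &active, const int num_active_paths)
{
  /* Free slots inside the target prefix [0, num_active_paths). */
  std::vector<int> free_slots;
  for (int i = 0; i < num_active_paths; i++) {
    if (!active[i]) {
      free_slots.push_back(i);
    }
  }

  /* Active slots outside the prefix are moved into the gaps. */
  size_t next_free = 0;
  for (int i = num_active_paths; i < (int)active.size(); i++) {
    if (active[i] && next_free < free_slots.size()) {
      active[free_slots[next_free++]] = true; /* "Move" the state into the gap. */
      active[i] = false;
    }
  }
  /* Assuming num_active_paths counts the active states, they all now live in the prefix. */
}
/* ------------------------------------------------------------------------------------------- */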
677 
679 {
680  /* If there are existing paths, wait for them to reach the intersect-closest kernel, which will
681  * align the wavefront of the existing and newly added paths. */
682  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
683  * performance. */
686  return false;
687  }
688 
690 
691  /* Don't schedule more work if canceling. */
692  if (is_cancel_requested()) {
693  if (num_active_paths == 0) {
694  finished = true;
695  }
696  return false;
697  }
698 
699  finished = false;
700 
701  vector<KernelWorkTile> work_tiles;
702 
703  int max_num_camera_paths = max_num_paths_;
704  int num_predicted_splits = 0;
705 
706  if (has_shadow_catcher()) {
707  /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
708  * make sure there is enough space in the path states array to fit the split states.
709  *
710  * Basically, when adding N new paths we ensure that there is 2*N available path states, so
711  * that all the new paths can be split.
712  *
713  * Note that it is possible that some of the current states can still split, so we need to make
714  * sure there is enough space for them as well. */
715 
716  /* Number of currently in-flight states which can still split. */
717  const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
718 
719  const int num_available_paths = max_num_paths_ - num_active_paths;
720  const int num_new_paths = num_available_paths / 2;
721  max_num_camera_paths = max(num_active_paths,
722  num_active_paths + num_new_paths - num_scheduled_possible_split);
723  num_predicted_splits += num_scheduled_possible_split + num_new_paths;
724  }
725 
726  /* Schedule when we're out of paths or there are too few paths to keep the
727  * device occupied. */
728  int num_paths = num_active_paths;
729  if (num_paths == 0 || num_paths < min_num_active_main_paths_) {
730  /* Get work tiles until the maximum number of paths is reached. */
731  while (num_paths < max_num_camera_paths) {
732  KernelWorkTile work_tile;
733  if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
734  work_tiles.push_back(work_tile);
735  num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
736  }
737  else {
738  break;
739  }
740  }
741 
742  /* If we couldn't get any more tiles, we're done. */
743  if (work_tiles.size() == 0 && num_paths == 0) {
744  finished = true;
745  return false;
746  }
747  }
748 
749  /* Initialize paths from work tiles. */
750  if (work_tiles.size() == 0) {
751  return false;
752  }
753 
754  /* Compact state array when number of paths becomes small relative to the
755  * known maximum path index, which makes computing active index arrays slow. */
757 
758  if (has_shadow_catcher()) {
759  integrator_next_main_path_index_.data()[0] = num_paths;
760  queue_->copy_to_device(integrator_next_main_path_index_);
761  }
762 
765  work_tiles.data(),
766  work_tiles.size(),
768  num_predicted_splits);
769 
770  return true;
771 }
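/* --- Illustrative aside (not part of the original file) -----------------------------------
 * With shadow catchers, the camera-path budget above reserves room for state splits: when
 * scheduling N new paths there should be roughly 2*N states available, plus headroom for
 * in-flight states that may still split. A sketch of that budget with hypothetical numbers: */
#include <algorithm>

static inline int example_camera_path_budget(const int max_num_paths,
                                             const int num_active_paths,
                                             const int num_possible_splits)
{
  /* E.g. 1000 states, 200 active, 50 possible splits: half of the 800 free states (400)
   * could become new camera paths, minus the 50 reserved for splits -> 550 total main paths. */
  const int num_available_paths = max_num_paths - num_active_paths;
  const int num_new_paths = num_available_paths / 2;
  return std::max(num_active_paths, num_active_paths + num_new_paths - num_possible_splits);
}
/* ------------------------------------------------------------------------------------------- */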
772 
774  const KernelWorkTile work_tiles[],
775  const int num_work_tiles,
776  const int num_active_paths,
777  const int num_predicted_splits)
778 {
779  /* Copy work tiles to device. */
780  if (work_tiles_.size() < num_work_tiles) {
781  work_tiles_.alloc(num_work_tiles);
782  }
783 
784  int path_index_offset = num_active_paths;
785  int max_tile_work_size = 0;
786  for (int i = 0; i < num_work_tiles; i++) {
787  KernelWorkTile &work_tile = work_tiles_.data()[i];
788  work_tile = work_tiles[i];
789 
790  const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
791 
792  work_tile.path_index_offset = path_index_offset;
793  work_tile.work_size = tile_work_size;
794 
795  path_index_offset += tile_work_size;
796 
797  max_tile_work_size = max(max_tile_work_size, tile_work_size);
798  }
799 
800  queue_->copy_to_device(work_tiles_);
801 
802  device_ptr d_work_tiles = work_tiles_.device_pointer;
803  device_ptr d_render_buffer = buffers_->buffer.device_pointer;
804 
805  /* Launch kernel. */
807  &d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);
808 
809  queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
810 
811  max_active_main_path_index_ = path_index_offset + num_predicted_splits;
812 }
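/* --- Illustrative aside (not part of the original file) -----------------------------------
 * The launch above uses padded indexing: the grid has `num_work_tiles * max_tile_work_size`
 * threads, and each thread recovers its tile and its position inside that tile from the flat
 * index, with threads past a tile's own work_size simply exiting. A sketch of the index math
 * the device-side init kernel is expected to perform (hypothetical helper, not the actual
 * kernel code): */
static inline bool example_decode_tile_thread(const int global_index,
                                              const int max_tile_work_size,
                                              const int tile_work_size,
                                              int &tile_index,
                                              int &tile_work_index)
{
  tile_index = global_index / max_tile_work_size;
  tile_work_index = global_index - tile_index * max_tile_work_size;
  /* Tiles smaller than the largest one leave some threads idle. */
  return tile_work_index < tile_work_size;
}
/* ------------------------------------------------------------------------------------------- */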
813 
815 {
817 
818  int num_paths = 0;
819  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
820  DCHECK_GE(queue_counter->num_queued[i], 0)
821  << "Invalid number of queued states for kernel "
822  << device_kernel_as_string(static_cast<DeviceKernel>(i));
823 
825  num_paths += queue_counter->num_queued[i];
826  }
827  }
828 
829  return num_paths;
830 }
831 
833 {
834  /* There are a few issues with graphics interop when using multiple devices, caused by the fact
835  * that the PathTraceDisplay has a single texture:
836  *
837  * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
838  * attempting to register an OpenGL PBO which has been mapped. This makes sense, because
839  * otherwise one would run into a conflict about where the source of truth is. */
840  if (has_multiple_works()) {
841  return false;
842  }
843 
844  if (!interop_use_checked_) {
845  Device *device = queue_->device;
847 
848  if (interop_use_) {
849  VLOG_INFO << "Using graphics interop GPU display update.";
850  }
851  else {
852  VLOG_INFO << "Using naive GPU display update.";
853  }
854 
855  interop_use_checked_ = true;
856  }
857 
858  return interop_use_;
859 }
860 
862  PassMode pass_mode,
863  int num_samples)
864 {
865  if (device_->have_error()) {
866  /* Don't attempt to update the GPU display if the device has errors: the error state will lead
867  * to wrong decisions about interop, causing more chained bugs. */
868  return;
869  }
870 
871  if (!buffers_->buffer.device_pointer) {
872  LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
873  return;
874  }
875 
877  if (copy_to_display_interop(display, pass_mode, num_samples)) {
878  return;
879  }
880 
881  /* If an error happens when trying to use graphics interop, fall back to the native
882  * implementation and don't attempt to use interop for further updates. */
883  interop_use_ = false;
884  }
885 
886  copy_to_display_naive(display, pass_mode, num_samples);
887 }
888 
890  PassMode pass_mode,
891  int num_samples)
892 {
897  const int final_width = buffers_->params.window_width;
898  const int final_height = buffers_->params.window_height;
899 
900  const int texture_x = full_x - effective_big_tile_params_.full_x +
902  const int texture_y = full_y - effective_big_tile_params_.full_y +
904 
905  /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
906  *
907  * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
908  * change of the resolution divider. However, if the display becomes smaller, shrink the
909  * allocated memory as well. */
910  if (display_rgba_half_.data_width != final_width ||
911  display_rgba_half_.data_height != final_height) {
912  display_rgba_half_.alloc(final_width, final_height);
913  /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
914  * transferring zeroes to the device. */
915  queue_->zero_to_device(display_rgba_half_);
916  }
917 
918  PassAccessor::Destination destination(film_->get_display_pass());
920 
921  get_render_tile_film_pixels(destination, pass_mode, num_samples);
922 
923  queue_->copy_from_device(display_rgba_half_);
924  queue_->synchronize();
925 
926  display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
927 }
928 
930  PassMode pass_mode,
931  int num_samples)
932 {
934  device_graphics_interop_ = queue_->graphics_interop_create();
935  }
936 
937  const DisplayDriver::GraphicsInterop graphics_interop_dst = display->graphics_interop_get();
938  device_graphics_interop_->set_display_interop(graphics_interop_dst);
939 
940  const device_ptr d_rgba_half = device_graphics_interop_->map();
941  if (!d_rgba_half) {
942  return false;
943  }
944 
946  destination.d_pixels_half_rgba = d_rgba_half;
947 
948  get_render_tile_film_pixels(destination, pass_mode, num_samples);
949 
950  device_graphics_interop_->unmap();
951 
952  return true;
953 }
954 
956 {
958  return;
959  }
960  display->graphics_interop_activate();
961  device_graphics_interop_ = nullptr;
962  display->graphics_interop_deactivate();
963 }
964 
966  PassMode pass_mode,
967  int num_samples)
968 {
969  const KernelFilm &kfilm = device_scene_->data.film;
970 
971  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
972  const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
973 
974  pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
975 }
976 
978 {
980 
981  if (num_active_pixels) {
984  queue_->synchronize();
985  }
986 
987  return num_active_pixels;
988 }
989 
991 {
993  num_active_pixels.alloc(1);
994 
995  queue_->zero_to_device(num_active_pixels);
996 
998 
999  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1004  &threshold,
1005  &reset,
1008  &num_active_pixels.device_pointer);
1009 
1011 
1012  queue_->copy_from_device(num_active_pixels);
1013  queue_->synchronize();
1014 
1015  return num_active_pixels.data()[0];
1016 }
1017 
1019 {
1021 
1022  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1029 
1031 }
1032 
1034 {
1036 
1037  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1044 
1046 }
1047 
1049 {
1051 
1052  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
1053  &work_size,
1056 
1058 }
1059 
1061 {
1062  queue_->copy_from_device(buffers_->buffer);
1063 
1064  /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
1065  return queue_->synchronize();
1066 }
1067 
1069 {
1070  queue_->copy_to_device(buffers_->buffer);
1071 
1072  /* NOTE: Direct device access to the buffers only happens within this path trace work. The rest
1073  * of the communication happens via API calls that involve `copy_render_buffers_from_device()`,
1074  * which will perform synchronization as needed. */
1075 
1076  return true;
1077 }
1078 
1080 {
1081  queue_->zero_to_device(buffers_->buffer);
1082 
1083  return true;
1084 }
1085 
1087 {
1088  return device_scene_->data.integrator.has_shadow_catcher;
1089 }
1090 
1092 {
1093  if (max_active_main_path_index_ == 0) {
1094  return 0;
1095  }
1096 
1097  if (!has_shadow_catcher()) {
1098  return 0;
1099  }
1100 
1101  queue_->zero_to_device(num_queued_paths_);
1102 
1104  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
1105 
1106  DeviceKernelArguments args(&work_size, &d_num_queued_paths);
1107 
1109  queue_->copy_from_device(num_queued_paths_);
1110  queue_->synchronize();
1111 
1112  return num_queued_paths_.data()[0];
1113 }
1114 
1116 {
1120 }
1121 
1123 {
1128 }
1129 
1131 {
1136 }
1137 
1139 {
1142 }
1143 
1145 {
1148 }
1149 