Blender V3.3
kernel/device/oneapi/kernel.cpp
/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2021-2022 Intel Corporation */

#ifdef WITH_ONEAPI

/* clang-format off */
#  include "kernel.h"
#  include <iostream>
#  include <map>
#  include <set>

#  include <CL/sycl.hpp>

#  include "kernel/device/oneapi/compat.h"
#  include "kernel/device/oneapi/globals.h"
#  include "kernel/device/oneapi/kernel_templates.h"

#  include "kernel/device/gpu/kernel.h"
/* clang-format on */

static OneAPIErrorCallback s_error_cb = nullptr;
static void *s_error_user_ptr = nullptr;

static std::vector<sycl::device> oneapi_available_devices();

void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
{
  s_error_cb = cb;
  s_error_user_ptr = user_ptr;
}

void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
#  ifdef _DEBUG
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  sycl::info::device_type device_type =
      queue->get_device().get_info<sycl::info::device::device_type>();
  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
  (void)usm_type;
  assert(usm_type == sycl::usm::alloc::device ||
         ((device_type == sycl::info::device_type::host ||
           device_type == sycl::info::device_type::cpu || allow_host) &&
          usm_type == sycl::usm::alloc::host));
#  endif
}

bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
{
  bool finished_correct = true;
  try {
    std::vector<sycl::device> devices = oneapi_available_devices();
    if (device_index < 0 || device_index >= devices.size()) {
      return false;
    }
    sycl::queue *created_queue = new sycl::queue(devices[device_index],
                                                 sycl::property::queue::in_order());
    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
  }
  catch (sycl::exception const &e) {
    finished_correct = false;
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
  }
  return finished_correct;
}

void oneapi_free_queue(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  delete queue;
}

void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}

void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  return sycl::malloc_device(memory_size, *queue);
}

void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  oneapi_check_usm(queue_, usm_ptr, true);
  sycl::free(usm_ptr, *queue);
}

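/* Editor's note: a minimal usage sketch of the queue/USM API above, assuming a host
 * application that links against this wrapper. Illustrative only (my_error_print_fn is a
 * hypothetical callback), not part of the original file:
 *
 *   SyclQueue *queue = nullptr;
 *   oneapi_set_error_cb(my_error_print_fn, nullptr);
 *   if (oneapi_create_queue(queue, 0)) {
 *     void *device_mem = oneapi_usm_alloc_device(queue, 1024);
 *     void *host_mem = oneapi_usm_aligned_alloc_host(queue, 1024, 64);
 *     // ... enqueue work, copy results back ...
 *     oneapi_usm_free(queue, device_mem);
 *     oneapi_usm_free(queue, host_mem);
 *     oneapi_free_queue(queue);
 *   }
 */
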
bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  oneapi_check_usm(queue_, dest, true);
  oneapi_check_usm(queue_, src, true);
  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
  try {
    /* NOTE(@nsirgien) Waiting on a memory operation may give more precise error
     * messages. Due to the impact on occupancy, it makes sense to enable it only
     * for Cycles debug builds. */
    mem_event.wait_and_throw();
    return true;
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    return false;
  }
#  else
  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
                             src_type == sycl::usm::alloc::device;
  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
                                          src_type == sycl::usm::alloc::unknown;
  /* NOTE(@sirgienko) A host-side blocking wait on this operation is mandatory, otherwise
   * the host may not wait until the end of the transfer before using the memory. */
  if (from_device_to_host || host_or_device_memop_with_offset)
    mem_event.wait();
  return true;
#  endif
}

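/* Editor's note: the device-to-host wait above matters because the queue runs
 * asynchronously. A sketch of the hazard it prevents (hypothetical caller code,
 * not part of the original file):
 *
 *   float result[4];
 *   oneapi_usm_memcpy(queue, result, device_ptr, sizeof(result));
 *   // Without the internal wait(), reading `result` here could race with the
 *   // still-running transfer; with it, the copy has completed before return.
 *   use(result[0]);
 */
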
bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  oneapi_check_usm(queue_, usm_ptr, true);
  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
  try {
    /* NOTE(@nsirgien) Waiting on a memory operation may give more precise error
     * messages. Due to the impact on occupancy, it makes sense to enable it only
     * for Cycles debug builds. */
    mem_event.wait_and_throw();
    return true;
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    return false;
  }
#  else
  (void)mem_event;
  return true;
#  endif
}

bool oneapi_queue_synchronize(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  try {
    queue->wait_and_throw();
    return true;
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    return false;
  }
}

/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
 * also trigger runtime compilation of all existing oneAPI kernels. */
bool oneapi_run_test_kernel(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  size_t N = 8;
  sycl::buffer<float, 1> A(N);
  sycl::buffer<float, 1> B(N);

  {
    sycl::host_accessor A_host_acc(A, sycl::write_only);
    for (size_t i = (size_t)0; i < N; i++)
      A_host_acc[i] = rand() % 32;
  }

  try {
    queue->submit([&](sycl::handler &cgh) {
      sycl::accessor A_acc(A, cgh, sycl::read_only);
      sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);

      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
    });
    queue->wait_and_throw();

    sycl::host_accessor A_host_acc(A, sycl::read_only);
    sycl::host_accessor B_host_acc(B, sycl::read_only);

    for (size_t i = (size_t)0; i < N; i++) {
      float result = A_host_acc[i] + B_host_acc[i];
      (void)result;
    }
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    return false;
  }

  return true;
}

bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
{
  kernel_global_size = sizeof(KernelGlobalsGPU);

  return true;
}

void oneapi_set_global_memory(SyclQueue *queue_,
                              void *kernel_globals,
                              const char *memory_name,
                              void *memory_device_pointer)
{
  assert(queue_);
  assert(kernel_globals);
  assert(memory_name);
  assert(memory_device_pointer);
  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
  oneapi_check_usm(queue_, memory_device_pointer);
  oneapi_check_usm(queue_, kernel_globals, true);

  std::string matched_name(memory_name);

/* This macro will change the global ptr of KernelGlobals via name matching. */
#  define KERNEL_DATA_ARRAY(type, name) \
    else if (#name == matched_name) \
    { \
      globals->__##name = (type *)memory_device_pointer; \
      return; \
    }
  if (false) {
  }
  else if ("integrator_state" == matched_name) {
    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
    return;
  }
  KERNEL_DATA_ARRAY(KernelData, data)
#  include "kernel/data_arrays.h"
  else
  {
    std::cerr << "Can't find global/constant memory with name \"" << matched_name << "\"!"
              << std::endl;
    assert(false);
  }
#  undef KERNEL_DATA_ARRAY
}

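/* Editor's note: a worked expansion of the name-matching macro above. For an
 * illustrative entry such as KERNEL_DATA_ARRAY(float4, bvh_nodes), the
 * preprocessor produces:
 *
 *   else if ("bvh_nodes" == matched_name) {
 *     globals->__bvh_nodes = (float4 *)memory_device_pointer;
 *     return;
 *   }
 *
 * so including "kernel/data_arrays.h" stamps out one such branch per data array,
 * forming a single if/else-if chain keyed on the memory name. */
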
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
/* TODO: Move the function below to oneapi/queue.cpp. */
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
                                          const DeviceKernel kernel,
                                          const size_t kernel_global_size)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  (void)kernel_global_size;
  const static size_t preferred_work_group_size_intersect_shading = 32;
  const static size_t preferred_work_group_size_technical = 1024;

  size_t preferred_work_group_size = 0;
  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
      preferred_work_group_size = preferred_work_group_size_intersect_shading;
      break;

    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
    case DEVICE_KERNEL_INTEGRATOR_RESET:
    case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
      preferred_work_group_size = preferred_work_group_size_technical;
      break;

    default:
      preferred_work_group_size = 512;
  }

  const size_t limit_work_group_size =
      queue->get_device().get_info<sycl::info::device::max_work_group_size>();
  return std::min(limit_work_group_size, preferred_work_group_size);
}

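/* Editor's note: the clamp above means, for example, that a preferred size of 1024
 * on a device whose max_work_group_size is 512 yields a local size of 512, while
 * the 32-wide shading preference passes through unchanged on the same device. */
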
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                           int kernel,
                           size_t global_size,
                           void **args)
{
  bool success = true;
  ::DeviceKernel device_kernel = (::DeviceKernel)kernel;
  KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals;
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue);
  assert(queue);
  if (!queue) {
    return false;
  }

  size_t local_size = oneapi_kernel_preferred_local_size(
      kernel_context->queue, device_kernel, global_size);
  assert(global_size % local_size == 0);

  /* The local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced,
   * so we overwrite it outside of oneapi_kernel_preferred_local_size. */
  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
    local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
  }

  /* Kernels listed below need a specific number of work groups. */
  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) {
    int num_states = *((int *)(args[0]));
    /* Round up to the next work-group. */
    size_t groups_count = (num_states + local_size - 1) / local_size;
    /* NOTE(@nsirgien): As of now, non-uniform work-groups don't work on most oneAPI
     * devices, so we extend the work size to fit the uniformity requirements. */
    global_size = groups_count * local_size;

#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
    if (queue->get_device().is_host()) {
      global_size = 1;
      local_size = 1;
    }
#  endif
  }

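  /* Editor's note: a worked instance of the rounding above. With num_states = 1000
   * and local_size = 64: groups_count = (1000 + 63) / 64 = 16, so global_size
   * becomes 16 * 64 = 1024, i.e. the smallest multiple of the work-group size
   * that covers all states. */
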
  /* Let the compiler throw an error if there are any kernels missing in this implementation. */
#  if defined(_WIN32)
#    pragma warning(error : 4062)
#  elif defined(__GNUC__)
#    pragma GCC diagnostic push
#    pragma GCC diagnostic error "-Wswitch"
#  endif

  try {
    queue->submit([&](sycl::handler &cgh) {
      switch (device_kernel) {
        case DEVICE_KERNEL_INTEGRATOR_RESET: {
          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_intersect_subsurface);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_intersect_volume_stack);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_shade_surface_raytrace);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_queued_shadow_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_terminated_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_terminated_shadow_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_compact_paths_array);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_compact_shadow_paths_array);
          break;
        }
        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_adaptive_sampling_convergence_check);
          break;
        }
        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x);
          break;
        }
        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y);
          break;
        }
        case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: {
          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace);
          break;
        }
        case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background);
          break;
        }
        case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_shader_eval_curve_shadow_transparency);
          break;
        }
        case DEVICE_KERNEL_PREFIX_SUM: {
          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
          break;
        }

        /* clang-format off */
#  define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
        case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \
          oneapi_call(kg, cgh, \
                      global_size, \
                      local_size, \
                      args, \
                      oneapi_kernel_film_convert_##variant); \
          break; \
        }

#  define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \
        DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
        DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba)

        DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
        DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
        DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
        DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
        DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
        DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
        DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
        DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
        DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER);
        DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow,
                                   SHADOW_CATCHER_MATTE_WITH_SHADOW);
        DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED);
        DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4);

#  undef DEVICE_KERNEL_FILM_CONVERT
#  undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL
        /* clang-format on */

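        /* Editor's note: each DEVICE_KERNEL_FILM_CONVERT(x, X) above expands into two
         * switch cases. For example, DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH) becomes:
         *
         *   case DEVICE_KERNEL_FILM_CONVERT_DEPTH: {
         *     oneapi_call(kg, cgh, global_size, local_size, args,
         *                 oneapi_kernel_film_convert_depth);
         *     break;
         *   }
         *   case DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA: {
         *     oneapi_call(kg, cgh, global_size, local_size, args,
         *                 oneapi_kernel_film_convert_depth_half_rgba);
         *     break;
         *   }
         */
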
        case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess);
          break;
        }
        case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_filter_guiding_set_fake_albedo);
          break;
        }
        case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess);
          break;
        }
        case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess);
          break;
        }
        case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: {
          oneapi_call(
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_compact_shadow_states);
          break;
        }
        case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: {
          oneapi_call(kg,
                      cgh,
                      global_size,
                      local_size,
                      args,
                      oneapi_kernel_integrator_shadow_catcher_count_possible_splits);
          break;
        }
        /* Unsupported kernels. */
        case DEVICE_KERNEL_NUM:
        case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
          kernel_assert(0);
          break;
      }
    });
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    success = false;
  }

#  if defined(_WIN32)
#    pragma warning(default : 4062)
#  elif defined(__GNUC__)
#    pragma GCC diagnostic pop
#  endif
  return success;
}

/* The compute-runtime (i.e. NEO) version is what gets returned by SYCL/L0 on Windows
 * since Windows driver 101.3268. */
/* The same minimum compute-runtime version is currently required across Windows and Linux.
 * For Windows driver 101.3430, the compute-runtime version is 23904. */
static const int lowest_supported_driver_version_win = 1013430;
static const int lowest_supported_driver_version_neo = 23904;

static int parse_driver_build_version(const sycl::device &device)
{
  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
  int driver_build_version = 0;

  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
  if (second_dot_position == std::string::npos) {
    std::cerr << "Unable to parse Intel GPU driver version \"" << driver_version
              << "\": it does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)"
              << " or xx.xx.xxx.xxxx (Windows) for device \""
              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
  }
  else {
    try {
      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
      if (third_dot_position != std::string::npos) {
        const std::string &third_number_substr = driver_version.substr(
            second_dot_position + 1, third_dot_position - second_dot_position - 1);
        const std::string &fourth_number_substr = driver_version.substr(third_dot_position + 1);
        if (third_number_substr.length() == 3 && fourth_number_substr.length() == 4)
          driver_build_version = std::stoi(third_number_substr) * 10000 +
                                 std::stoi(fourth_number_substr);
      }
      else {
        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
        driver_build_version = std::stoi(third_number_substr);
      }
    }
    catch (std::invalid_argument &e) {
      std::cerr << "Unable to parse Intel GPU driver version \"" << driver_version
                << "\": it does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)"
                << " or xx.xx.xxx.xxxx (Windows) for device \""
                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
    }
  }

  return driver_build_version;
}

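/* Editor's note: worked examples of the parsing above (the version strings are
 * illustrative). A Windows-style "31.0.101.3430" has four fields, so the third
 * ("101", length 3) and fourth ("3430", length 4) combine into
 * 101 * 10000 + 3430 = 1013430; a Linux/NEO-style "22.26.23904" has three fields,
 * so the substring after the second dot yields 23904 directly. */
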
static std::vector<sycl::device> oneapi_available_devices()
{
  bool allow_all_devices = false;
  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
    allow_all_devices = true;

  /* The host device is useful only for debugging at the moment,
   * so we hide this device with the default build settings. */
#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
  bool allow_host = true;
#  else
  bool allow_host = false;
#  endif

  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();

  std::vector<sycl::device> available_devices;
  for (const sycl::platform &platform : oneapi_platforms) {
    /* Ignore OpenCL platforms to avoid using the same devices through both Level-Zero
     * and OpenCL. */
    if (platform.get_backend() == sycl::backend::opencl) {
      continue;
    }

    const std::vector<sycl::device> &oneapi_devices =
        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
                                            platform.get_devices(sycl::info::device_type::gpu);

    for (const sycl::device &device : oneapi_devices) {
      if (allow_all_devices) {
        /* Still filter out the host device if the build doesn't support it. */
        if (allow_host || !device.is_host()) {
          available_devices.push_back(device);
        }
      }
      else {
        bool filter_out = false;

        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
         * Official support can be broadened to older and smaller GPUs once ready. */
        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
          /* Filtered-out defaults, in case these values aren't available through a too old
           * L0 runtime. */
          int number_of_eus = 96;
          int threads_per_eu = 7;
          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
          }
          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
            threads_per_eu =
                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
          }
          /* This filters out all Level-Zero supported GPUs from generations older than Arc. */
          if (number_of_eus <= 96 && threads_per_eu == 7) {
            filter_out = true;
          }
          /* If not already filtered out, check the driver version. */
          if (!filter_out) {
            int driver_build_version = parse_driver_build_version(device);
            if ((driver_build_version > 100000 &&
                 driver_build_version < lowest_supported_driver_version_win) ||
                driver_build_version < lowest_supported_driver_version_neo) {
              filter_out = true;
            }
          }
        }
        else if (!allow_host && device.is_host()) {
          filter_out = true;
        }
        else if (!allow_all_devices) {
          filter_out = true;
        }

        if (!filter_out) {
          available_devices.push_back(device);
        }
      }
    }
  }

  return available_devices;
}

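/* Editor's note: an illustrative pass through the EU filter above (hardware figures
 * are indicative, not exhaustive). An Arc-class discrete GPU reporting 512 EUs with
 * 8 hardware threads per EU fails the "number_of_eus <= 96 && threads_per_eu == 7"
 * test and is kept; a Gen9-era integrated GPU with, say, 24-96 EUs and 7 threads
 * per EU matches the test and is filtered out. */
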
char *oneapi_device_capabilities()
{
  std::stringstream capabilities;

  const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
  for (const sycl::device &device : oneapi_devices) {
    const std::string &name = device.get_info<sycl::info::device::name>();

    capabilities << std::string("\t") << name << "\n";
#  define WRITE_ATTR(attribute_name, attribute_variable) \
    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
                 << "\n";
#  define GET_NUM_ATTR(attribute) \
    { \
      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
    }

    GET_NUM_ATTR(vendor_id)
    GET_NUM_ATTR(max_compute_units)
    GET_NUM_ATTR(max_work_item_dimensions)

    sycl::id<3> max_work_item_sizes =
        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))

    GET_NUM_ATTR(max_work_group_size)
    GET_NUM_ATTR(max_num_sub_groups)
    GET_NUM_ATTR(sub_group_independent_forward_progress)

    GET_NUM_ATTR(preferred_vector_width_char)
    GET_NUM_ATTR(preferred_vector_width_short)
    GET_NUM_ATTR(preferred_vector_width_int)
    GET_NUM_ATTR(preferred_vector_width_long)
    GET_NUM_ATTR(preferred_vector_width_float)
    GET_NUM_ATTR(preferred_vector_width_double)
    GET_NUM_ATTR(preferred_vector_width_half)

    GET_NUM_ATTR(native_vector_width_char)
    GET_NUM_ATTR(native_vector_width_short)
    GET_NUM_ATTR(native_vector_width_int)
    GET_NUM_ATTR(native_vector_width_long)
    GET_NUM_ATTR(native_vector_width_float)
    GET_NUM_ATTR(native_vector_width_double)
    GET_NUM_ATTR(native_vector_width_half)

    size_t max_clock_frequency =
        (size_t)(device.is_host() ? (size_t)0 :
                                    device.get_info<sycl::info::device::max_clock_frequency>());
    WRITE_ATTR("max_clock_frequency", max_clock_frequency)

    GET_NUM_ATTR(address_bits)
    GET_NUM_ATTR(max_mem_alloc_size)

    /* NOTE(@nsirgien): The implementation doesn't use image support, as bindless images
     * aren't supported, so we always return false here, even if the device supports HW
     * texture usage acceleration. */
    bool image_support = false;
    WRITE_ATTR("image_support", (size_t)image_support)

    GET_NUM_ATTR(max_parameter_size)
    GET_NUM_ATTR(mem_base_addr_align)
    GET_NUM_ATTR(global_mem_size)
    GET_NUM_ATTR(local_mem_size)
    GET_NUM_ATTR(error_correction_support)
    GET_NUM_ATTR(profiling_timer_resolution)
    GET_NUM_ATTR(is_available)

#  undef GET_NUM_ATTR
#  undef WRITE_ATTR
    capabilities << "\n";
  }

  return ::strdup(capabilities.str().c_str());
}

void oneapi_free(void *p)
{
  if (p) {
    ::free(p);
  }
}

void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
  int num = 0;
  std::vector<sycl::device> devices = oneapi_available_devices();
  for (sycl::device &device : devices) {
    const std::string &platform_name =
        device.get_platform().get_info<sycl::info::platform::name>();
    std::string name = device.get_info<sycl::info::device::name>();
    std::string id = "ONEAPI_" + platform_name + "_" + name;
    if (device.has(sycl::aspect::ext_intel_pci_address)) {
      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
    }
    (cb)(id.c_str(), name.c_str(), num, user_ptr);
    num++;
  }
}

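/* Editor's note: the generated ID concatenates the platform name, device name and,
 * when available, the PCI address. A hypothetical example:
 * "ONEAPI_Intel(R) Level-Zero_Intel(R) Arc(TM) A770 Graphics_0000:03:00.0". */
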
size_t oneapi_get_memcapacity(SyclQueue *queue)
{
  return reinterpret_cast<sycl::queue *>(queue)
      ->get_device()
      .get_info<sycl::info::device::global_mem_size>();
}

int oneapi_get_num_multiprocessors(SyclQueue *queue)
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
  }
  else {
    return 0;
  }
}

int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
  }
  else {
    return 0;
  }
}

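/* Editor's note: a worked instance of the product above, with illustrative
 * numbers: an EU SIMD width of 8 and 8 hardware threads per EU gives
 * 8 * 8 = 64 threads per EU, which Cycles treats as the per-multiprocessor
 * thread capacity. */
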
#endif /* WITH_ONEAPI */