Blender  V3.3
device/queue.h
Go to the documentation of this file.
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 #pragma once
5 
6 #include "device/kernel.h"
7 
9 #include "util/debug.h"
10 #include "util/log.h"
11 #include "util/map.h"
12 #include "util/string.h"
13 #include "util/unique_ptr.h"
14 
16 
17 class Device;
18 class device_memory;
19 
20 struct KernelWorkTile;
21 
22 /* Container for device kernel arguments with type correctness ensured by API. */
24 
25  enum Type {
31  };
32 
33  static const int MAX_ARGS = 18;
35  void *values[MAX_ARGS];
36  size_t sizes[MAX_ARGS];
37  size_t count = 0;
38 
40  {
41  }
42 
43  template<class T> DeviceKernelArguments(const T *arg)
44  {
45  add(arg);
46  }
47 
48  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
49  {
50  add(first);
51  add(args...);
52  }
53 
54  void add(const KernelFilmConvert *value)
55  {
56  add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
57  }
58  void add(const device_ptr *value)
59  {
60  add(POINTER, value, sizeof(device_ptr));
61  }
62  void add(const int32_t *value)
63  {
64  add(INT32, value, sizeof(int32_t));
65  }
66  void add(const float *value)
67  {
68  add(FLOAT32, value, sizeof(float));
69  }
70  void add(const bool *value)
71  {
72  add(BOOLEAN, value, 4);
73  }
74  void add(const Type type, const void *value, size_t size)
75  {
76  assert(count < MAX_ARGS);
77 
78  types[count] = type;
79  values[count] = (void *)value;
80  sizes[count] = size;
81  count++;
82  }
83  template<typename T, typename... Args> void add(const T *first, Args... args)
84  {
85  add(first);
86  add(args...);
87  }
88 };
89 
90 /* Abstraction of a command queue for a device.
91  * Provides API to schedule kernel execution in a specific queue with minimal possible overhead
92  * from driver side.
93  *
94  * This class encapsulates all properties needed for commands execution. */
95 class DeviceQueue {
96  public:
97  virtual ~DeviceQueue();
98 
99  /* Number of concurrent states to process for integrator,
100  * based on number of cores and/or available memory. */
101  virtual int num_concurrent_states(const size_t state_size) const = 0;
102 
103  /* Number of states which keeps the device occupied with work without losing performance.
104  * The renderer will add more work (when available) when number of active paths falls below this
105  * value. */
106  virtual int num_concurrent_busy_states() const = 0;
107 
108  /* Number of elements in a partition of sorted shaders, that improves memory locality of
109  * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
110  virtual int num_sort_partition_elements() const
111  {
112  return 65536;
113  }
114 
115  /* Initialize execution of kernels on this queue.
116  *
117  * Will, for example, load all data required by the kernels from Device to global or path state.
118  *
119  * Use this method after device synchronization has finished before enqueueing any kernels. */
120  virtual void init_execution() = 0;
121 
122  /* Enqueue kernel execution.
123  *
124  * Execute the kernel work_size times on the device.
125  * Supported argument types:
126  * - int: pass pointer to the int
127  * - device memory: pass pointer to device_memory.device_pointer
128  * Return false if there was an error executing this or a previous kernel. */
130  const int work_size,
131  DeviceKernelArguments const &args) = 0;
132 
133  /* Wait until all enqueued kernels have finished execution.
134  * Return false if there was an error executing any of the enqueued kernels. */
135  virtual bool synchronize() = 0;
136 
137  /* Copy memory to/from device as part of the command queue, to ensure
138  * operations are done in order without having to synchronize. */
139  virtual void zero_to_device(device_memory &mem) = 0;
140  virtual void copy_to_device(device_memory &mem) = 0;
141  virtual void copy_from_device(device_memory &mem) = 0;
142 
143  /* Graphics resources interoperability.
144  *
145  * Interoperability here means that the device is capable of computing the result directly into
146  * an OpenGL (or other graphics library) buffer. */
147 
148  /* Create graphics interoperability context which will be taking care of mapping graphics
149  * resource as a buffer writable by kernels of this device. */
150  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
151  {
152  LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
153  return nullptr;
154  }
155 
156  /* Device this queue has been created for. */
158 
159  protected:
160  /* Hide construction so that allocation via `Device` API is enforced. */
161  explicit DeviceQueue(Device *device);
162 
163  /* Implementations call these from the corresponding methods to generate debugging logs. */
164  void debug_init_execution();
165  void debug_enqueue(DeviceKernel kernel, const int work_size);
166  void debug_synchronize();
167  string debug_active_kernels();
168 
169  /* Combination of kernels enqueued together since the last synchronize. */
171  /* Time of synchronize call. */
173  /* Accumulated execution time for combinations of kernels launched together. */
174  map<DeviceKernelMask, double> stats_kernel_time_;
175 };
176 
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition: btDbvt.cpp:52
void debug_enqueue(DeviceKernel kernel, const int work_size)
Definition: queue.cpp:53
double last_sync_time_
Definition: device/queue.h:172
virtual int num_sort_partition_elements() const
Definition: device/queue.h:110
DeviceKernelMask last_kernels_enqueued_
Definition: device/queue.h:170
virtual void copy_from_device(device_memory &mem)=0
void debug_synchronize()
Definition: queue.cpp:63
virtual int num_concurrent_states(const size_t state_size) const =0
Device * device
Definition: device/queue.h:157
virtual void init_execution()=0
virtual void copy_to_device(device_memory &mem)=0
map< DeviceKernelMask, double > stats_kernel_time_
Definition: device/queue.h:174
virtual unique_ptr< DeviceGraphicsInterop > graphics_interop_create()
Definition: device/queue.h:150
virtual int num_concurrent_busy_states() const =0
string debug_active_kernels()
Definition: queue.cpp:78
virtual ~DeviceQueue()
Definition: queue.cpp:20
virtual bool synchronize()=0
virtual bool enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args)=0
void debug_init_execution()
Definition: queue.cpp:44
DeviceQueue(Device *device)
Definition: queue.cpp:14
virtual void zero_to_device(device_memory &mem)=0
#define CCL_NAMESPACE_END
Definition: cuda/compat.h:9
uint64_t DeviceKernelMask
Definition: device/kernel.h:17
SyclQueue void void size_t num_bytes SyclQueue void const char void *memory_device_pointer KernelContext int kernel
ccl_gpu_kernel_postfix ccl_global const int ccl_global float const int work_size
DeviceKernel
#define LOG(severity)
Definition: log.h:36
#define T
signed int int32_t
Definition: stdint.h:77
static const int MAX_ARGS
Definition: device/queue.h:33
void add(const device_ptr *value)
Definition: device/queue.h:58
DeviceKernelArguments(const T *arg)
Definition: device/queue.h:43
void add(const T *first, Args... args)
Definition: device/queue.h:83
void add(const Type type, const void *value, size_t size)
Definition: device/queue.h:74
void add(const bool *value)
Definition: device/queue.h:70
void add(const float *value)
Definition: device/queue.h:66
void * values[MAX_ARGS]
Definition: device/queue.h:35
void add(const int32_t *value)
Definition: device/queue.h:62
DeviceKernelArguments(const T *first, Args... args)
Definition: device/queue.h:48
size_t sizes[MAX_ARGS]
Definition: device/queue.h:36
void add(const KernelFilmConvert *value)
Definition: device/queue.h:54
Type types[MAX_ARGS]
Definition: device/queue.h:34
uint64_t device_ptr
Definition: util/types.h:43