Blender v3.3
cuda/queue.cpp
/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#ifdef WITH_CUDA

#  include "device/cuda/queue.h"

#  include "device/cuda/device_impl.h"
#  include "device/cuda/graphics_interop.h"
#  include "device/cuda/kernel.h"

CCL_NAMESPACE_BEGIN

/* CUDADeviceQueue */

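/* A queue wraps a dedicated CUDA stream: kernels and memory transfers
 * enqueued here execute asynchronously in stream order. CU_STREAM_NON_BLOCKING
 * means the stream does not implicitly synchronize with the legacy default
 * stream. */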
CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
}

CUDADeviceQueue::~CUDADeviceQueue()
{
  const CUDAContextScope scope(cuda_device_);
  cuStreamDestroy(cuda_stream_);
}

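/* Size the integrator state pool from the device's thread capacity:
 * max(num_multiprocessors * max_threads_per_multiprocessor, 65536) * 16.
 * Illustrative example (assumed GPU, not from this file): with 82
 * multiprocessors and 1536 resident threads each, max_num_threads =
 * 82 * 1536 = 125952 and num_states = 125952 * 16 = 2015232; at an assumed
 * 384-byte state that is roughly 740 MiB, as reported by the VLOG below. */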
int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();
  int num_states = max(max_num_threads, 65536) * 16;

  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
  if (factor_str) {
    const float factor = (float)atof(factor_str);
    if (factor != 0.0f) {
      num_states = max((int)(num_states * factor), 1024);
    }
    else {
      VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
    }
  }

  VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
                    << string_human_readable_size(num_states * state_size);

  return num_states;
}

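/* Number of states that can be actively worked on at once: four per resident
 * hardware thread, with a fixed fallback when the device query returns 0
 * (presumably when device information is not available). */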
int CUDADeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();

  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

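/* Expected to run before the first enqueue on this queue: uploads texture
 * info and waits on the context so kernels see a consistent device state. */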
void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}

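/* Kernels are launched on a 1D grid sized to cover work_size. Illustrative
 * example (assumed numbers): work_size = 1000000 with 256 threads per block
 * gives num_blocks = divide_up(1000000, 256) = 3907; excess threads in the
 * last block are expected to bounds-check against work_size inside the
 * kernel. */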
bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
                              const int work_size,
                              DeviceKernelArguments const &args)
{
  if (cuda_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const CUDAContextScope scope(cuda_device_);
  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;

    default:
      break;
  }

  /* Launch kernel. */
  assert_success(cuLaunchKernel(cuda_kernel.function,
                                num_blocks,
                                1,
                                1,
                                num_threads_per_block,
                                1,
                                1,
                                shared_mem_bytes,
                                cuda_stream_,
                                const_cast<void **>(args.values),
                                0),
                 "enqueue");

  return !(cuda_device_->have_error());
}

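/* Block the host until all previously enqueued work on the stream has
 * completed. Returns false if the device is in an error state. */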
bool CUDADeviceQueue::synchronize()
{
  if (cuda_device_->have_error()) {
    return false;
  }

  const CUDAContextScope scope(cuda_device_);
  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");

  debug_synchronize();

  return !(cuda_device_->have_error());
}

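/* Clear a buffer to zero, allocating it on demand. cuMemsetD8Async is
 * asynchronous with respect to the host; the memory is only guaranteed to be
 * zeroed once the stream has been synchronized. */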
void CUDADeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
      "zero_to_device");
}

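/* Upload host memory to the device, allocating on demand. The copy is
 * asynchronous, so the host buffer must stay valid until the stream is
 * synchronized. */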
void CUDADeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyHtoDAsync(
          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
      "copy_to_device");
}

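/* Download device memory to the host. Also asynchronous: the host copy is
 * only complete after synchronize(). */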
void CUDADeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyDtoHAsync(
          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
      "copy_from_device");
}

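/* Translate a failing CUresult into a device error, using cuew (the CUDA
 * extension wrangler) to obtain the error name. */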
void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
{
  if (result != CUDA_SUCCESS) {
    const char *name = cuewErrorString(result);
    cuda_device_->set_error(string_printf(
        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
  }
}

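/* Create the graphics interop object bound to this queue, which Cycles uses
 * to share display buffers with the viewport without a round trip through
 * host memory. */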
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
{
  return make_unique<CUDADeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_CUDA */