Blender V3.3
hip/queue.cpp
/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#ifdef WITH_HIP

#  include "device/hip/queue.h"

#  include "device/hip/device_impl.h"
#  include "device/hip/graphics_interop.h"
#  include "device/hip/kernel.h"

CCL_NAMESPACE_BEGIN

/* HIPDeviceQueue */

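/* Each queue owns one HIP stream, created with hipStreamNonBlocking so that
 * work submitted to this queue does not implicitly synchronize with the
 * default stream. */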
HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
{
  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
}

HIPDeviceQueue::~HIPDeviceQueue()
{
  const HIPContextScope scope(hip_device_);
  hipStreamDestroy(hip_stream_);
}

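/* Heuristic: allocate 16 integrator states per hardware thread (with a
 * fallback of 65536 threads when the device query reports zero), optionally
 * scaled by the CYCLES_CONCURRENT_STATES_FACTOR environment variable and
 * clamped to at least 1024 states. */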
int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
{
  const int max_num_threads = hip_device_->get_num_multiprocessors() *
                              hip_device_->get_max_num_threads_per_multiprocessor();
  int num_states = ((max_num_threads == 0) ? 65536 : max_num_threads) * 16;

  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
  if (factor_str) {
    const float factor = (float)atof(factor_str);
    if (factor != 0.0f) {
      num_states = max((int)(num_states * factor), 1024);
    }
    else {
      VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
    }
  }

  VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
                    << string_human_readable_size(num_states * state_size);

  return num_states;
}

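/* Allow 4x the hardware thread count of states to be busy at once (65536 when
 * the device query reports zero), presumably to keep enough work in flight to
 * hide latency. */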
int HIPDeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = hip_device_->get_num_multiprocessors() *
                              hip_device_->get_max_num_threads_per_multiprocessor();

  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

void HIPDeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  HIPContextScope scope(hip_device_);
  hip_device_->load_texture_info();
  hip_device_assert(hip_device_, hipDeviceSynchronize());

  debug_init_execution();
}

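/* Launch a kernel over a 1D grid: work_size threads split into blocks of the
 * kernel's preferred block size. Shared memory is only requested for the
 * active-index kernels handled in the switch below. */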
bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
                             const int work_size,
                             DeviceKernelArguments const &args)
{
  if (hip_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const HIPContextScope scope(hip_device_);
  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = hip_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  assert_success(hipModuleLaunchKernel(hip_kernel.function,
                                       num_blocks,
                                       1,
                                       1,
                                       num_threads_per_block,
                                       1,
                                       1,
                                       shared_mem_bytes,
                                       hip_stream_,
                                       const_cast<void **>(args.values),
                                       0),
                 "enqueue");

  return !(hip_device_->have_error());
}

bool HIPDeviceQueue::synchronize()
{
  if (hip_device_->have_error()) {
    return false;
  }

  const HIPContextScope scope(hip_device_);
  assert_success(hipStreamSynchronize(hip_stream_), "synchronize");
  debug_synchronize();

  return !(hip_device_->have_error());
}

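/* The memory operations below are asynchronous on the queue's stream: they
 * may return before the device work completes, so callers must synchronize()
 * before relying on the results on the host. */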
void HIPDeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const HIPContextScope scope(hip_device_);
  assert_success(
      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_),
      "zero_to_device");
}

void HIPDeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const HIPContextScope scope(hip_device_);
  assert_success(
      hipMemcpyHtoDAsync(
          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_),
      "copy_to_device");
}

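/* Unlike the to-device paths there is no on-demand allocation here: the
 * device buffer must already exist. */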
void HIPDeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const HIPContextScope scope(hip_device_);
  assert_success(
      hipMemcpyDtoHAsync(
          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_),
      "copy_from_device");
}

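/* Convert a failed HIP result into a device error, including the list of
 * active kernels to make debugging easier. */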
void HIPDeviceQueue::assert_success(hipError_t result, const char *operation)
{
  if (result != hipSuccess) {
    const char *name = hipewErrorString(result);
    hip_device_->set_error(
        string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str()));
  }
}

unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
  return make_unique<HIPDeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_HIP */