Blender  V3.3
shader_eval.cpp
Go to the documentation of this file.
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 #include "integrator/shader_eval.h"
5 
6 #include "device/device.h"
7 #include "device/queue.h"
8 
9 #include "device/cpu/kernel.h"
11 
12 #include "util/log.h"
13 #include "util/progress.h"
14 #include "util/tbb.h"
15 
17 
/* Construct a shader evaluator for `device`.
 * The device pointer and the progress reference are stored in `device_` and `progress_`;
 * `progress_` is what the evaluation functions poll for cancellation. */
18 ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
19 {
    /* A null device is a programming error: assert on it here. */
20  DCHECK_NE(device_, nullptr);
21 }
22 
/* ShaderEval::eval: evaluate shaders of the given type for a batch of input points.
 *
 * NOTE(review): the first line of this definition (return type, function name and the
 * leading `type` parameter) is absent from this listing. The reference index at the
 * end of the page gives the signature as
 * `bool eval(const ShaderEvalType type, const int max_num_inputs, ...)`.
 *
 * - max_num_inputs: number of elements allocated for the input buffer.
 * - num_channels: number of output floats allocated per evaluated point.
 * - fill_input: callback that fills the input buffer and returns the number of points
 *   to evaluate. A return value of 0 skips the evaluation.
 * - read_output: callback invoked with the output buffer after a successful evaluation.
 *
 * Returns the result of eval_cpu() or eval_gpu(). When nothing is evaluated the
 * initial value of `success`, true, is returned. */
24  const int max_num_inputs,
25  const int num_channels,
26  const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
27  const function<void(device_vector<float> &)> &read_output)
28 {
29  bool first_device = true;
30  bool success = true;
31 
32  device_->foreach_device([&](Device *device) {
    /* Only the first device visited does any work; for every later device the
     * callback logs a message and returns immediately. */
33  if (!first_device) {
34  VLOG_WORK << "Multi-devices are not yet fully implemented, will evaluate shader on a "
35  "single device.";
36  return;
37  }
38  first_device = false;
39 
40  device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
41  device_vector<float> output(device, "ShaderEval output", MEM_READ_WRITE);
42 
43  /* Allocate and copy device buffers. */
44  DCHECK_EQ(input.device, device);
45  DCHECK_EQ(output.device, device);
    /* Note that this size comparison runs before either buffer has been allocated. */
46  DCHECK_LE(output.size(), input.size());
47 
48  input.alloc(max_num_inputs);
49  int num_points = fill_input(input);
    /* Nothing to evaluate: `success` keeps its current value and no kernel is run.
     * The explicit free() calls at the end of this callback are not reached on this
     * path; presumably the device_vector destructors release the buffers -- TODO
     * confirm. */
50  if (num_points == 0) {
51  return;
52  }
53 
54  input.copy_to_device();
    /* One group of `num_channels` floats per evaluated point, zeroed on the device
     * before the kernel runs. */
55  output.alloc(num_points * num_channels);
56  output.zero_to_device();
57 
58  /* Evaluate on CPU or GPU. */
59  success = (device->info.type == DEVICE_CPU) ?
60  eval_cpu(device, type, input, output, num_points) :
61  eval_gpu(device, type, input, output, num_points);
62 
63  /* Copy data back from device if not canceled. */
64  if (success) {
65  output.copy_from_device(0, 1, output.size());
66  read_output(output);
67  }
68 
    /* Release both buffers explicitly once the results have been handed over. */
69  input.free();
70  output.free();
71  });
72 
73  return success;
74 }
75 
/* ShaderEval::eval_cpu: run the CPU shader evaluation kernel selected by `type` over
 * the work items, writing results into `output`.
 *
 * NOTE(review): several lines of this definition are absent from this listing: the
 * signature's first line and two parameter lines, the loop header that introduces
 * `work_index`, and the `case` labels of the switch. The reference index at the end
 * of the page gives the signature as
 * `bool eval_cpu(Device *device, const ShaderEvalType type,
 *  device_vector<KernelShaderEvalInput> &input, device_vector<float> &output,
 *  const int64_t work_size)`.
 *
 * Returns false when cancellation was observed through `progress_`, true otherwise. */
77  const ShaderEvalType type,
80  const int64_t work_size)
81 {
    /* Fetch the kernel globals from the device; they are indexed by thread below. */
82  vector<CPUKernelThreadGlobals> kernel_thread_globals;
83  device->get_cpu_kernel_thread_globals(kernel_thread_globals);
84 
85  /* Find required kernel function. */
86  const CPUKernels &kernels = Device::get_cpu_kernels();
87 
88  /* Simple parallel_for over all work items. */
89  KernelShaderEvalInput *input_data = input.data();
90  float *output_data = output.data();
91  bool success = true;
92 
    /* Run inside a dedicated arena sized by the device's configured CPU thread count. */
93  tbb::task_arena local_arena(device->info.cpu_threads);
94  local_arena.execute([&]() {
    /* NOTE(review): the loop header that opens the inner lambda is missing here.
     * `success` is a plain bool captured by reference; if this body runs on several
     * threads at once, the assignment below is an unsynchronized concurrent write.
     * Consider std::atomic<bool> -- TODO confirm against the full source. */
96  /* TODO: is this fast enough? */
97  if (progress_.get_cancel()) {
98  success = false;
99  return;
100  }
101 
     /* Pick the kernel globals slot that belongs to the thread running this item. */
102  const int thread_index = tbb::this_task_arena::current_thread_index();
103  const KernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];
104 
     /* Dispatch on the evaluation type; each branch evaluates one work item. */
105  switch (type) {
107  kernels.shader_eval_displace(kg, input_data, output_data, work_index);
108  break;
110  kernels.shader_eval_background(kg, input_data, output_data, work_index);
111  break;
113  kernels.shader_eval_curve_shadow_transparency(kg, input_data, output_data, work_index);
114  break;
115  }
116  });
117  });
118 
119  return success;
120 }
121 
/* ShaderEval::eval_gpu: run the GPU shader evaluation kernel selected by `type` over
 * `work_size` items, in fixed-size chunks so that cancellation can be honored between
 * chunks.
 *
 * NOTE(review): several lines of this definition are absent from this listing: the
 * signature's first line and two parameter lines, the declaration of `kernel`, and
 * the `case` labels and assignments inside the switch. The reference index at the end
 * of the page gives the signature as
 * `bool eval_gpu(Device *device, const ShaderEvalType type,
 *  device_vector<KernelShaderEvalInput> &input, device_vector<float> &output,
 *  const int64_t work_size)`.
 *
 * Returns false as soon as cancellation is observed after a chunk, true once every
 * chunk has been executed without a cancel request. */
123  const ShaderEvalType type,
126  const int64_t work_size)
127 {
128  /* Find required kernel function. */
130  switch (type) {
133  break;
136  break;
139  break;
140  };
141 
142  /* Create device queue. */
143  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
144  queue->init_execution();
145 
146  /* Execute work on GPU in chunk, so we can cancel.
147  * TODO: query appropriate size from device. */
148  const int32_t chunk_size = 65536;
149 
150  device_ptr d_input = input.device_pointer;
151  device_ptr d_output = output.device_pointer;
152 
     /* Offsets and chunk sizes below are 32-bit, so the total must fit in int32_t. */
153  assert(work_size <= 0x7fffffff);
154  for (int32_t d_offset = 0; d_offset < int32_t(work_size); d_offset += chunk_size) {
     /* The last chunk may be shorter than chunk_size. */
155  int32_t d_work_size = std::min(chunk_size, int32_t(work_size) - d_offset);
156 
     /* The arguments are passed by address, including the per-chunk offset and size. */
157  DeviceKernelArguments args(&d_input, &d_output, &d_offset, &d_work_size);
158 
     /* Wait for each chunk before looking at the cancel flag. */
159  queue->enqueue(kernel, d_work_size, args);
160  queue->synchronize();
161 
     /* Polled after every chunk, including the final one: a cancel request seen here
      * makes the function return false even if all the work has already completed. */
162  if (progress_.get_cancel()) {
163  return false;
164  }
165  }
166 
167  return true;
168 }
169 
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
ShaderEvalFunction shader_eval_background
ShaderEvalFunction shader_eval_displace
ShaderEvalFunction shader_eval_curve_shadow_transparency
DeviceType type
Definition: device/device.h:62
static const CPUKernels & get_cpu_kernels()
virtual void get_cpu_kernel_thread_globals(vector< CPUKernelThreadGlobals > &)
virtual unique_ptr< DeviceQueue > gpu_queue_create()
virtual void foreach_device(const function< void(Device *)> &callback)
DeviceInfo info
bool get_cancel() const
Definition: progress.h:90
bool eval(const ShaderEvalType type, const int max_num_inputs, const int num_channels, const function< int(device_vector< KernelShaderEvalInput > &)> &fill_input, const function< void(device_vector< float > &)> &read_output)
Definition: shader_eval.cpp:23
ShaderEval(Device *device, Progress &progress)
Definition: shader_eval.cpp:18
bool eval_cpu(Device *device, const ShaderEvalType type, device_vector< KernelShaderEvalInput > &input, device_vector< float > &output, const int64_t work_size)
Definition: shader_eval.cpp:76
Progress & progress_
bool eval_gpu(Device *device, const ShaderEvalType type, device_vector< KernelShaderEvalInput > &input, device_vector< float > &output, const int64_t work_size)
Definition: shader_eval.cpp:122
#define CCL_NAMESPACE_END
Definition: cuda/compat.h:9
@ MEM_READ_WRITE
@ MEM_READ_ONLY
@ DEVICE_CPU
Definition: device/device.h:38
SyclQueue * queue
SyclQueue void void size_t num_bytes SyclQueue void const char void *memory_device_pointer KernelContext int kernel
@ SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY
@ SHADER_EVAL_BACKGROUND
@ SHADER_EVAL_DISPLACE
ccl_gpu_kernel_postfix ccl_global const int ccl_global float const int work_size
ccl_global KernelShaderEvalInput ccl_global float * output
ccl_global KernelShaderEvalInput * input
const int work_index
DeviceKernel
@ DEVICE_KERNEL_SHADER_EVAL_DISPLACE
@ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND
@ DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY
#define DCHECK_EQ(a, b)
Definition: log.h:64
#define VLOG_WORK
Definition: log.h:80
#define DCHECK_LE(a, b)
Definition: log.h:67
#define DCHECK_NE(a, b)
Definition: log.h:63
static const int chunk_size
void parallel_for(IndexRange range, int64_t grain_size, const Function &function)
Definition: BLI_task.hh:51
#define min(a, b)
Definition: sort.c:35
__int64 int64_t
Definition: stdint.h:89
signed int int32_t
Definition: stdint.h:77
uint64_t device_ptr
Definition: util/types.h:43