kernel.mm (Blender 3.3)
/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2021-2022 Blender Foundation */

#ifdef WITH_METAL

#  include "device/metal/kernel.h"
#  include "util/md5.h"
#  include "util/path.h"
#  include "util/tbb.h"
#  include "util/time.h"
#  include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

/* Limit to 2 MTLCompiler instances. */
int max_mtlcompiler_threads = 2;

const char *kernel_type_as_string(MetalPipelineType pso_type)
{
  switch (pso_type) {
    case PSO_GENERIC:
      return "PSO_GENERIC";
    case PSO_SPECIALIZED_INTERSECT:
      return "PSO_SPECIALIZED_INTERSECT";
    case PSO_SPECIALIZED_SHADE:
      return "PSO_SPECIALIZED_SHADE";
    default:
      assert(0);
  }
  return "";
}

bool kernel_has_intersection(DeviceKernel device_kernel)
{
  return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
}

struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
  }
  ~ShaderCache();

  /* Get the fastest available pipeline for the specified kernel. */
  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);

  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
   * device. */
  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);

  bool should_load_kernel(DeviceKernel device_kernel,
                          MetalDevice *device,
                          MetalPipelineType pso_type);

  void wait_for_all();

 private:
  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);

  void compile_thread_func(int thread_index);

  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;

  struct PipelineRequest {
    MetalKernelPipeline *pipeline = nullptr;
    std::function<void(MetalKernelPipeline *)> completionHandler;
  };

  std::mutex cache_mutex;

  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
  id<MTLDevice> mtlDevice;

  bool running = false;
  std::condition_variable cond_var;
  std::deque<PipelineRequest> request_queue;
  std::vector<std::thread> compile_threads;
  std::atomic_int incomplete_requests = 0;
};
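
/* A ShaderCache owns a small pool of compile threads that service
 * PipelineRequest entries from request_queue: load_kernel() is the producer
 * (it queues a request and signals cond_var), compile_thread_func() is the
 * consumer, and wait_for_all() blocks until every outstanding request has
 * completed. Illustrative flow, using a hypothetical `device` pointer that is
 * not part of this file:
 *
 *   ShaderCache *cache = get_shader_cache(device->mtlDevice);
 *   cache->load_kernel(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, device, PSO_GENERIC);
 *   cache->wait_for_all();
 *   MetalKernelPipeline *best = cache->get_best_pipeline(
 *       DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, device);
 */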

std::mutex g_shaderCacheMutex;
std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;

ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
{
  thread_scoped_lock lock(g_shaderCacheMutex);
  auto it = g_shaderCache.find(mtlDevice);
  if (it != g_shaderCache.end()) {
    return it->second.get();
  }

  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
  return g_shaderCache[mtlDevice].get();
}
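
/* One ShaderCache is kept per id<MTLDevice> so that multi-GPU setups do not
 * share pipelines between devices. Entries are never erased, so the cache for
 * a device lives for the remainder of the process once it has been created. */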

ShaderCache::~ShaderCache()
{
  metal_printf("ShaderCache shutting down with incomplete_requests = %d\n",
               int(incomplete_requests));

  running = false;
  cond_var.notify_all();
  for (auto &thread : compile_threads) {
    thread.join();
  }
}

void ShaderCache::wait_for_all()
{
  while (incomplete_requests > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}

void ShaderCache::compile_thread_func(int thread_index)
{
  while (1) {

    /* wait for / acquire next request */
    PipelineRequest request;
    {
      thread_scoped_lock lock(cache_mutex);
      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
      if (!running) {
        break;
      }

      if (!request_queue.empty()) {
        request = request_queue.front();
        request_queue.pop_front();
      }
    }

    /* service request */
    if (request.pipeline) {
      request.pipeline->compile();
      incomplete_requests--;
    }
  }
}
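
/* The worker loop above pairs with the shutdown handshake in ~ShaderCache():
 * clearing `running` and calling notify_all() wakes every thread out of the
 * cond_var wait so it can be joined. Each serviced request also decrements
 * incomplete_requests, which is the counter wait_for_all() polls. */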

bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
                                     MetalDevice *device,
                                     MetalPipelineType pso_type)
{
  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* Skip megakernel. */
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (pso_type != PSO_GENERIC) {
    /* Only specialize kernels where it can make an impact. */
    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
      return false;
    }

    /* Only specialize shading / intersection kernels as requested. */
    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
    if (is_shade_pso != is_shade_kernel) {
      return false;
    }
  }

  {
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
      if (pipeline->source_md5 == device->source_md5[pso_type]) {
        return false;
      }
    }
  }

  return true;
}
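
/* Illustrative outcomes of the checks above (assuming the usual Cycles
 * DeviceKernel ordering, where the integrator shade kernels start at
 * DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND):
 *
 *   should_load_kernel(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL, device, PSO_GENERIC)
 *     -> false: the megakernel is always skipped on Metal.
 *   should_load_kernel(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, device, PSO_SPECIALIZED_INTERSECT)
 *     -> false: shade kernels are only specialized via PSO_SPECIALIZED_SHADE.
 *   should_load_kernel(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, device, PSO_SPECIALIZED_SHADE)
 *     -> true, unless a pipeline with the same source_md5 is already cached.
 */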

void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
                              MetalPipelineType pso_type)
{
  {
    /* create compiler threads on first run */
    thread_scoped_lock lock(cache_mutex);
    if (compile_threads.empty()) {
      running = true;
      for (int i = 0; i < max_mtlcompiler_threads; i++) {
        compile_threads.push_back(std::thread([this, i] { compile_thread_func(i); }));
      }
    }
  }

  if (!should_load_kernel(device_kernel, device, pso_type)) {
    return;
  }

  incomplete_requests++;

  PipelineRequest request;
  request.pipeline = new MetalKernelPipeline;
  memcpy(&request.pipeline->kernel_data_,
         &device->launch_params.data,
         sizeof(request.pipeline->kernel_data_));
  request.pipeline->pso_type = pso_type;
  request.pipeline->mtlDevice = mtlDevice;
  request.pipeline->source_md5 = device->source_md5[pso_type];
  request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  request.pipeline->device_kernel = device_kernel;
  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

  /* metalrt options */
  request.pipeline->use_metalrt = device->use_metalrt;
  request.pipeline->metalrt_features = device->use_metalrt ?
                                           (device->kernel_features & METALRT_FEATURE_MASK) :
                                           0;

  {
    thread_scoped_lock lock(cache_mutex);
    auto &collection = pipelines[device_kernel];

    /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
    int max_entries_of_same_pso_type = 3;
    for (int i = (int)collection.size() - 1; i >= 0; i--) {
      if (collection[i]->pso_type == pso_type) {
        max_entries_of_same_pso_type -= 1;
        if (max_entries_of_same_pso_type == 0) {
          metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
                       kernel_type_as_string(pso_type),
                       device_kernel_as_string(device_kernel));
          collection.erase(collection.begin() + i);
          break;
        }
      }
    }

    collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
    request_queue.push_back(request);
  }
  cond_var.notify_one();
}
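
/* load_kernel() returns as soon as the request is queued; the actual Metal
 * compilation happens on the worker threads. Callers that need the result
 * immediately follow up with wait_for_all(), as MetalDeviceKernels::load()
 * does below:
 *
 *   cache->load_kernel(kernel, device, PSO_SPECIALIZED_SHADE);  // non-blocking
 *   cache->wait_for_all();                                      // block until compiled
 */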

MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
{
  thread_scoped_lock lock(cache_mutex);
  auto &collection = pipelines[kernel];
  if (collection.empty()) {
    return nullptr;
  }

  /* metalrt options */
  bool use_metalrt = device->use_metalrt;
  bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR;
  bool device_metalrt_hair_thick = use_metalrt &&
                                   device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
  bool device_metalrt_pointcloud = use_metalrt &&
                                   device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
  bool device_metalrt_motion = use_metalrt &&
                               device->kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

  MetalKernelPipeline *best_pipeline = nullptr;
  for (auto &pipeline : collection) {
    if (!pipeline->loaded) {
      /* still loading - ignore */
      continue;
    }

    bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
    bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
    bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
    bool pipeline_metalrt_motion = use_metalrt &&
                                   pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;

    if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
        pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
        pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
        pipeline_metalrt_motion != device_metalrt_motion) {
      /* wrong combination of metalrt options */
      continue;
    }

    if (pipeline->pso_type != PSO_GENERIC) {
      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
        best_pipeline = pipeline.get();
      }
    }
    else if (!best_pipeline) {
      best_pipeline = pipeline.get();
    }
  }

  if (!best_pipeline) {
    /* No pipeline for this kernel has finished compiling yet. */
    return nullptr;
  }

  if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
    metal_printf("Swapping in %s version of %s\n",
                 kernel_type_as_string(best_pipeline->pso_type),
                 device_kernel_as_string(kernel));
  }
  best_pipeline->usage_count += 1;

  return best_pipeline;
}
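
/* Selection policy: among loaded pipelines whose MetalRT feature set matches
 * the device, a specialized pipeline whose source_md5 matches the device's
 * current specialization is preferred; the generic pipeline acts as the
 * fallback while specialized variants are still compiling. The usage_count
 * bump is what makes the "Swapping in ..." message print only the first time
 * a specialized variant gets picked up. */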

bool MetalKernelPipeline::should_use_binary_archive() const
{
  /* Issues with binary archives in older macOS versions. */
  if (@available(macOS 13.0, *)) {
    if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
      if (atoi(str) != 0) {
        /* Don't archive if we have opted out by env var. */
        return false;
      }
    }

    /* Workaround for Intel GPUs having issues with Binary Archives. */
    MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
    if (gpu_vendor == METAL_GPU_INTEL) {
      return false;
    }

    if (pso_type == PSO_GENERIC) {
      /* Archive the generic kernels. */
      return true;
    }

    if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
        device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
      /* Archive all shade kernels - they take a long time to compile. */
      return true;
    }
  }

  /* The remaining kernels are all fast to compile. They may get cached by the system shader
   * cache, but will be quick to regenerate if not. */
  return false;
}
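
/* Binary archives can be disabled at runtime for debugging; any non-zero
 * value opts out and every run then goes through the regular
 * MTLLibrary -> MTLComputePipelineState path. Example shell usage (the .blend
 * path is a placeholder):
 *
 *   CYCLES_METAL_DISABLE_BINARY_ARCHIVES=1 blender --background scene.blend -f 1
 */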

static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
{
  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];

  MTLDataType MTLDataType_int = MTLDataTypeInt;
  MTLDataType MTLDataType_float = MTLDataTypeFloat;
  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
  KernelData zero_data = {0};
  if (!data) {
    data = &zero_data;
  }
  int zero_int = 0;
  [constant_values setConstantValue:&zero_int type:MTLDataType_int atIndex:Kernel_DummyConstant];

#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
    [constant_values setConstantValue:&data->parent.name \
                                 type:MTLDataType_##_type \
                              atIndex:KernelData_##parent##_##name];

#  include "kernel/data_template.h"

  return constant_values;
}
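
/* KERNEL_STRUCT_MEMBER is expanded once for every KernelData field listed in
 * kernel/data_template.h, so each scene constant becomes a Metal function
 * constant. For a hypothetical int member `integrator.max_bounce` the
 * expansion would read roughly:
 *
 *   [constant_values setConstantValue:&data->integrator.max_bounce
 *                                type:MTLDataType_int
 *                             atIndex:KernelData_integrator_max_bounce];
 *
 * For PSO_GENERIC a zero-initialized KernelData is passed, so no scene-specific
 * values are baked in; specialized PSOs bake in the values from kernel_data_. */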

void MetalKernelPipeline::compile()
{
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

  int threads_per_threadgroup = this->threads_per_threadgroup;
  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
    /* Always use 512 for the sorting kernels */
    threads_per_threadgroup = 512;
  }

  NSString *entryPoint = [@(function_name.c_str()) copy];

  NSError *error = NULL;
  if (@available(macOS 11.0, *)) {
    MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
    func_desc.name = entryPoint;

    if (pso_type != PSO_GENERIC) {
      func_desc.constantValues = GetConstantValues(&kernel_data_);
    }
    else {
      func_desc.constantValues = GetConstantValues();
    }

    function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
  }

  [entryPoint release];

  if (function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
    return;
  }

  function.label = [entryPoint copy];

  if (use_metalrt) {
    if (@available(macOS 11.0, *)) {
      /* create the id<MTLFunction> for each intersection function */
      const char *function_names[] = {
          "__anyhit__cycles_metalrt_visibility_test_tri",
          "__anyhit__cycles_metalrt_visibility_test_box",
          "__anyhit__cycles_metalrt_shadow_all_hit_tri",
          "__anyhit__cycles_metalrt_shadow_all_hit_box",
          "__anyhit__cycles_metalrt_local_hit_tri",
          "__anyhit__cycles_metalrt_local_hit_box",
          "__intersection__curve_ribbon",
          "__intersection__curve_ribbon_shadow",
          "__intersection__curve_all",
          "__intersection__curve_all_shadow",
          "__intersection__point",
          "__intersection__point_shadow",
      };
      assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);

      MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
      for (int i = 0; i < METALRT_FUNC_NUM; i++) {
        const char *function_name = function_names[i];
        desc.name = [@(function_name) copy];

        if (pso_type != PSO_GENERIC) {
          desc.constantValues = GetConstantValues(&kernel_data_);
        }
        else {
          desc.constantValues = GetConstantValues();
        }

        NSError *error = NULL;
        rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error];

        if (rt_intersection_function[i] == nil) {
          NSString *err = [error localizedDescription];
          string errors = [err UTF8String];

          error_str = string_printf(
              "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
          break;
        }

        rt_intersection_function[i].label = [@(function_name) copy];
      }
    }
  }

  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
  NSArray *linked_functions = nil;

  bool metalrt_hair = use_metalrt && (metalrt_features & KERNEL_FEATURE_HAIR);
  bool metalrt_hair_thick = use_metalrt && (metalrt_features & KERNEL_FEATURE_HAIR_THICK);
  bool metalrt_pointcloud = use_metalrt && (metalrt_features & KERNEL_FEATURE_POINTCLOUD);

  if (use_metalrt) {
    id<MTLFunction> curve_intersect_default = nil;
    id<MTLFunction> curve_intersect_shadow = nil;
    id<MTLFunction> point_intersect_default = nil;
    id<MTLFunction> point_intersect_shadow = nil;
    if (metalrt_hair) {
      /* Add curve intersection programs. */
      if (metalrt_hair_thick) {
        /* Slower programs for thick hair since that also slows down ribbons.
         * Ideally this should not be needed. */
        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
      }
      else {
        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
      }
    }
    if (metalrt_pointcloud) {
      point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
      point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
    }
    table_functions[METALRT_TABLE_DEFAULT] = [NSArray
        arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
                         curve_intersect_default ?
                             curve_intersect_default :
                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
                         point_intersect_default ?
                             point_intersect_default :
                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
                         nil];
    table_functions[METALRT_TABLE_SHADOW] = [NSArray
        arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
                         curve_intersect_shadow ?
                             curve_intersect_shadow :
                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
                         point_intersect_shadow ?
                             point_intersect_shadow :
                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
                         nil];
    table_functions[METALRT_TABLE_LOCAL] = [NSArray
        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         nil];

    NSMutableSet *unique_functions = [NSMutableSet
        setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];

    if (kernel_has_intersection(device_kernel)) {
      linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
          sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
            return [f1.label compare:f2.label];
          }];
    }
    unique_functions = nil;
  }

  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
      [[MTLComputePipelineDescriptor alloc] init];

  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;

  if (@available(macos 10.14, *)) {
    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
  }
  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;

  computePipelineStateDescriptor.computeFunction = function;

  if (@available(macOS 11.0, *)) {
    /* Attach the additional functions to an MTLLinkedFunctions object */
    if (linked_functions) {
      computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
      computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
    }
    computePipelineStateDescriptor.maxCallStackDepth = 1;
    if (use_metalrt) {
      computePipelineStateDescriptor.maxCallStackDepth = 8;
    }
  }

  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

  bool use_binary_archive = should_use_binary_archive();

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
  string metalbin_name;
  if (use_binary_archive) {
    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
    MD5Hash local_md5;
    local_md5.append(source_md5);
    local_md5.append(osVersion);
    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
                     sizeof(this->threads_per_threadgroup));

    string options;
    if (use_metalrt && kernel_has_intersection(device_kernel)) {
      /* incorporate any MetalRT specializations into the archive name */
      options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
                               metalrt_hair ? 1 : 0,
                               metalrt_hair_thick ? 1 : 0,
                               metalrt_pointcloud ? 1 : 0);
    }

    /* Replace non-alphanumerical characters with underscores. */
    string device_name = [mtlDevice.name UTF8String];
    for (char &c : device_name) {
      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
        c = '_';
      }
    }

    metalbin_name = device_name;
    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");

    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

    if (path_exists(metalbin_path) && use_binary_archive) {
      if (@available(macOS 11.0, *)) {
        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
        [archiveDesc release];
      }
    }
  }

  __block bool creating_new_archive = false;
  if (@available(macOS 11.0, *)) {
    if (use_binary_archive) {
      if (!archive) {
        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
        archiveDesc.url = nil;
        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
        creating_new_archive = true;
      }
      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
    }
  }

  double starttime = time_dt();

  MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
      id<MTLComputePipelineState> computePipelineState,
      MTLComputePipelineReflection *reflection,
      NSError *error) {
    bool recreate_archive = false;
    if (computePipelineState == nil && archive) {
      NSString *errStr = [error localizedDescription];
      metal_printf(
          "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
          "(error: %s)\n",
          device_kernel_as_string((DeviceKernel)device_kernel),
          errStr ? [errStr UTF8String] : "nil");
      computePipelineState = [mtlDevice
          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                        options:MTLPipelineOptionNone
                                     reflection:nullptr
                                          error:&error];
      recreate_archive = true;
    }

    double duration = time_dt() - starttime;

    if (computePipelineState == nil) {
      NSString *errStr = [error localizedDescription];
      error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
                                device_kernel_as_string((DeviceKernel)device_kernel));
      error_str += (errStr ? [errStr UTF8String] : "nil");
      metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
                   kernel_type_as_string(pso_type),
                   device_kernel,
                   device_kernel_as_string((DeviceKernel)device_kernel),
                   duration);
      return;
    }

    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
                                           computePipelineState.threadExecutionWidth);
    num_threads_per_block = std::max(num_threads_per_block,
                                     (int)computePipelineState.threadExecutionWidth);
    this->pipeline = computePipelineState;
    this->num_threads_per_block = num_threads_per_block;

    if (@available(macOS 11.0, *)) {
      if (creating_new_archive || recreate_archive) {
        if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
                               error:&error]) {
          metal_printf("Failed to save binary archive, error:\n%s\n",
                       [[error localizedDescription] UTF8String]);
        }
      }
    }
  };

  /* Block on load to ensure we continue with a valid kernel function */
  if (creating_new_archive) {
    starttime = time_dt();
    NSError *error;
    if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
                                                      error:&error]) {
      NSString *errStr = [error localizedDescription];
      metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
    }
  }
  id<MTLComputePipelineState> pipeline = [mtlDevice
      newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                    options:pipelineOptions
                                 reflection:nullptr
                                      error:&error];
  completionHandler(pipeline, nullptr, error);

  this->loaded = true;
  [computePipelineStateDescriptor release];
  computePipelineStateDescriptor = nil;

  if (use_metalrt && linked_functions) {
    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
      if (@available(macOS 11.0, *)) {
        MTLIntersectionFunctionTableDescriptor *ift_desc =
            [[MTLIntersectionFunctionTableDescriptor alloc] init];
        ift_desc.functionCount = table_functions[table].count;
        intersection_func_table[table] = [this->pipeline
            newIntersectionFunctionTableWithDescriptor:ift_desc];

        /* Finally write the function handles into this pipeline's table */
        int size = (int)[table_functions[table] count];
        for (int i = 0; i < size; i++) {
          id<MTLFunctionHandle> handle = [pipeline
              functionHandleWithFunction:table_functions[table][i]];
          [intersection_func_table[table] setFunction:handle atIndex:i];
        }
      }
    }
  }

  double duration = time_dt() - starttime;

  if (!use_binary_archive) {
    metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration);
  }
  else {
    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
                 kernel_type_as_string(pso_type),
                 device_kernel,
                 device_kernel_as_string((DeviceKernel)device_kernel),
                 duration,
                 creating_new_archive ? " new" : "load",
                 metalbin_name.c_str());
  }
}
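
/* The pipeline and num_threads_per_block computed above are what the device
 * uses when encoding this kernel. A minimal dispatch sketch, assuming a
 * hypothetical compute encoder and work size that are not part of this file:
 *
 *   const MetalKernelPipeline *kp = MetalDeviceKernels::get_best_pipeline(device, kernel);
 *   id<MTLComputeCommandEncoder> enc = ...;
 *   [enc setComputePipelineState:kp->pipeline];
 *   const int block_size = kp->num_threads_per_block;
 *   [enc dispatchThreadgroups:MTLSizeMake((work_size + block_size - 1) / block_size, 1, 1)
 *       threadsPerThreadgroup:MTLSizeMake(block_size, 1, 1)];
 */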

bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
  const double starttime = time_dt();
  auto shader_cache = get_shader_cache(device->mtlDevice);
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
  }

  shader_cache->wait_for_all();
  metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
               time_dt() - starttime,
               kernel_type_as_string(pso_type));
  return true;
}

bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
{
  auto shader_cache = get_shader_cache(device->mtlDevice);
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
      return true;
    }
  }
  return false;
}
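
/* Typical call sequence from the Metal device (illustrative sketch, not code
 * from this file): check whether anything still needs compiling, then do a
 * blocking load at the requested specialization level.
 *
 *   if (MetalDeviceKernels::should_load_kernels(device, pso_type)) {
 *     MetalDeviceKernels::load(device, pso_type);
 *   }
 */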

const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
                                                                 DeviceKernel kernel)
{
  return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
}

CCL_NAMESPACE_END

#endif /* WITH_METAL */