15 #if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
16 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
17 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
18 # define GPU_BLOCK_MAX_THREADS 1024
19 # define GPU_THREAD_MAX_REGISTERS 63
22 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
23 # define GPU_KERNEL_MAX_REGISTERS 63
26 #elif __CUDA_ARCH__ == 320
27 # define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
28 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
29 # define GPU_BLOCK_MAX_THREADS 1024
30 # define GPU_THREAD_MAX_REGISTERS 63
33 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
34 # define GPU_KERNEL_MAX_REGISTERS 63
37 #elif __CUDA_ARCH__ == 370
38 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
39 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
40 # define GPU_BLOCK_MAX_THREADS 1024
41 # define GPU_THREAD_MAX_REGISTERS 255
44 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
45 # define GPU_KERNEL_MAX_REGISTERS 63
48 #elif __CUDA_ARCH__ <= 699
49 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
50 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
51 # define GPU_BLOCK_MAX_THREADS 1024
52 # define GPU_THREAD_MAX_REGISTERS 255
55 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
58 # if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
59 # define GPU_KERNEL_MAX_REGISTERS 64
61 # define GPU_KERNEL_MAX_REGISTERS 48
65 #elif __CUDA_ARCH__ <= 899
66 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
67 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
68 # define GPU_BLOCK_MAX_THREADS 1024
69 # define GPU_THREAD_MAX_REGISTERS 255
72 # define GPU_KERNEL_BLOCK_NUM_THREADS 512
73 # define GPU_KERNEL_MAX_REGISTERS 96
77 # error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
82 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
83 extern "C" __global__ void __launch_bounds__(block_num_threads, \
84 GPU_MULTIPRESSOR_MAX_REGISTERS / \
85 (block_num_threads * thread_num_registers))
87 #define ccl_gpu_kernel_threads(block_num_threads) \
88 extern "C" __global__ void __launch_bounds__(block_num_threads)
90 #define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
91 #define ccl_gpu_kernel_postfix
93 #define ccl_gpu_kernel_call(x) x
97 #define ccl_gpu_kernel_lambda(func, ...) \
98 struct KernelLambda { \
100 __device__ int operator()(const int state) \
104 } ccl_gpu_kernel_lambda_pass
108 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
109 # error "Maximum number of threads per block exceeded"
112 #if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
113 GPU_MULTIPROCESSOR_MAX_BLOCKS
114 # error "Maximum number of blocks per multiprocessor exceeded"
117 #if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
118 # error "Maximum number of registers per thread exceeded"