/* Device limits that the sanity checks in this file compare the kernel
 * configuration against. */

/* Register budget used as the numerator when deriving a blocks-per-multiprocessor
 * count. The identifier is spelled "MULTIPRESSOR"; it is kept exactly as-is
 * because other macros in this file refer to it by this name. */
#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
/* Upper bound on blocks per multiprocessor. */
#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
/* Upper bound on threads per block. */
#define GPU_BLOCK_MAX_THREADS 1024
/* Upper bound on registers per thread. */
#define GPU_THREAD_MAX_REGISTERS 255

/* Kernel configuration values; validated against the limits above by the
 * preprocessor checks in this file. */
#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
#define GPU_KERNEL_MAX_REGISTERS 64
/* Kernel declaration prefix with an explicit per-thread register count.
 *
 * Expands to `extern "C" __global__ void` followed by __launch_bounds__ with:
 *  - first argument:  block_num_threads
 *  - second argument: GPU_MULTIPRESSOR_MAX_REGISTERS divided by the product
 *                     block_num_threads * thread_num_registers
 *
 * Both macro arguments are parenthesized inside the product so that expression
 * arguments (e.g. `a + b`) keep their intended precedence. */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__( \
      block_num_threads, \
      GPU_MULTIPRESSOR_MAX_REGISTERS / ((block_num_threads) * (thread_num_registers)))

/* Kernel declaration prefix that only passes the block size to
 * __launch_bounds__, without a second argument. */
#define ccl_gpu_kernel_threads(block_num_threads) \
  extern "C" __global__ void __launch_bounds__(block_num_threads)
/* Builds the kernel name and parameter list: pastes `name` onto the
 * `kernel_gpu_` prefix and forwards the remaining arguments as the
 * parenthesized parameter list. */
#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)

/* Expands to nothing. */
#define ccl_gpu_kernel_postfix

/* Expands to its argument unchanged. */
#define ccl_gpu_kernel_call(x) x
/* Defines a function object named `ccl_gpu_kernel_lambda_pass`.
 *
 * `func` is the expression evaluated by the call operator; it may refer to the
 * `state` parameter and to the members declared through the variadic
 * arguments. The variadic arguments are emitted verbatim as the member
 * declaration(s) of the struct, i.e. the captured state that the caller
 * assigns before passing the object on.
 *
 * The call operator is device-only and returns the value of `func` as int. */
#define ccl_gpu_kernel_lambda(func, ...) \
  struct KernelLambda { \
    __VA_ARGS__; \
    __device__ int operator()(const int state) \
    { \
      return (func); \
    } \
  } ccl_gpu_kernel_lambda_pass
/* Sanity check: the configured block size must not exceed the maximum number
 * of threads per block. */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
#  error "Maximum number of threads per block exceeded"
#endif
/* Sanity check: the blocks-per-multiprocessor count derived from the register
 * budget (registers / (threads per block * registers per thread)) must not
 * exceed the maximum number of blocks per multiprocessor. */
#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
    GPU_MULTIPROCESSOR_MAX_BLOCKS
#  error "Maximum number of blocks per multiprocessor exceeded"
#endif
/* Sanity check: the configured register count must not exceed the maximum
 * number of registers per thread. */
#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
#  error "Maximum number of registers per thread exceeded"
#endif