1#ifndef STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
2#define STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
5#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
6#ifndef OPENCL_DEVICE_ID
7#error OPENCL_DEVICE_ID_NOT_SET
9#ifndef OPENCL_PLATFORM_ID
10#error OPENCL_PLATFORM_ID_NOT_SET
16#include <CL/opencl.hpp>
17#include <tbb/concurrent_vector.h>
21#include <unordered_map>
76 int device_id = OPENCL_DEVICE_ID) {
82 "CL_INVALID_PLATFORM");
89 "CL_DEVICE_NOT_FOUND");
97 cl_command_queue_properties device_properties;
98 device_[0].getInfo<cl_command_queue_properties>(
99 CL_DEVICE_QUEUE_PROPERTIES, &device_properties);
100 device_[0].getInfo<
size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE,
102 std::vector<size_t> max_wg_sizes
103 =
device_[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
104 if (max_wg_sizes.size() < 3) {
106 "The device does not support 3D work groups!");
110 if (device_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
113 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
nullptr);
119 int max_square_block_size
120 = std::min({max_wg_sizes[0], max_wg_sizes[1],
127 if (max_square_block_size <
base_opts_[
"THREAD_BLOCK_SIZE"]) {
128 base_opts_[
"THREAD_BLOCK_SIZE"] = max_square_block_size;
135 / std::log(
base_opts_[
"REDUCTION_STEP_SIZE"]);
137 = std::pow(
base_opts_[
"REDUCTION_STEP_SIZE"], p);
146 }
catch (
const cl::Error&
e) {
170 {
"THREAD_BLOCK_SIZE", 32},
171 {
"WORK_PER_THREAD", 8},
172 {
"REDUCTION_STEP_SIZE", 4},
173 {
"LOCAL_SIZE_", 4096}};
222 std::ostringstream msg;
224 msg <<
"Platform ID: " << OPENCL_DEVICE_ID <<
"\n";
225 msg <<
"Platform Name: "
228 .getInfo<CL_PLATFORM_NAME>()
230 msg <<
"Platform Vendor: "
233 .getInfo<CL_PLATFORM_VENDOR>()
235 msg <<
"\tDevice " << OPENCL_DEVICE_ID <<
": "
237 msg <<
"\t\tDevice Name: "
240 .getInfo<CL_DEVICE_NAME>()
242 msg <<
"\t\tDevice Type: "
245 .getInfo<CL_DEVICE_TYPE>()
247 msg <<
"\t\tDevice Vendor: "
250 .getInfo<CL_DEVICE_VENDOR>()
252 msg <<
"\t\tDevice Max Compute Units: "
255 .getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
257 msg <<
"\t\tDevice Global Memory: "
260 .getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()
262 msg <<
"\t\tDevice Max Clock Frequency: "
265 .getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()
267 msg <<
"\t\tDevice Max Allocateable Memory: "
270 .getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()
272 msg <<
"\t\tDevice Local Memory: "
275 .getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()
277 msg <<
"\t\tDevice Available: "
280 .getInfo<CL_DEVICE_AVAILABLE>()
291 std::vector<cl::Platform> all_platforms;
292 cl::Platform::get(&all_platforms);
293 std::ostringstream msg;
297 msg <<
"Number of Platforms: " << all_platforms.size() <<
"\n";
298 for (
auto plat_iter : all_platforms) {
301 msg <<
"Platform ID: " << platform_id++ <<
"\n";
302 msg <<
"Platform Name: " <<
platform.getInfo<CL_PLATFORM_NAME>() <<
"\n";
303 msg <<
"Platform Vendor: " <<
platform.getInfo<CL_PLATFORM_VENDOR>()
307 std::vector<cl::Device> all_devices;
308 platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
310 for (
auto device_iter : all_devices) {
311 cl::Device
device(device_iter);
313 msg <<
"\tDevice " << device_id++ <<
": "
315 msg <<
"\t\tDevice Name: " <<
device.getInfo<CL_DEVICE_NAME>()
317 msg <<
"\t\tDevice Type: " <<
device.getInfo<CL_DEVICE_TYPE>()
319 msg <<
"\t\tDevice Vendor: " <<
device.getInfo<CL_DEVICE_VENDOR>()
321 msg <<
"\t\tDevice Max Compute Units: "
322 <<
device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() <<
"\n";
323 msg <<
"\t\tDevice Global Memory: "
324 <<
device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() <<
"\n";
325 msg <<
"\t\tDevice Max Clock Frequency: "
326 <<
device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() <<
"\n";
327 msg <<
"\t\tDevice Max Allocateable Memory: "
328 <<
device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() <<
"\n";
329 msg <<
"\t\tDevice Local Memory: "
330 <<
device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() <<
"\n";
331 msg <<
"\t\tDevice Available: "
332 <<
device.getInfo<CL_DEVICE_AVAILABLE>() <<
"\n";
334 }
catch (
const cl::Error&
e) {
339 msg <<
"\tno (OpenCL) devices in the platform with ID " << platform_id
363 inline cl::CommandQueue&
queue() noexcept {
393 inline std::vector<cl::Device>&
device() noexcept {
400 inline std::vector<cl::Platform>&
platform() noexcept {
421 *cache = cl::Kernel();
cl::CommandQueue command_queue_
std::unordered_map< std::string, int > map_base_opts
static void select_device(int platform_id, int device_id)
std::string platform_name_
std::vector< cl::Platform > platform_
std::vector< cl::Device > device_
size_t max_thread_block_size_
std::vector< cl::Device > devices_
std::vector< cl::Platform > platforms_
static opencl_context_base & getInstance() noexcept
struct stan::math::opencl_context_base::tuning_struct tuning_opts_
The opencl_context_base class represents an OpenCL context in the standard Meyers singleton design pattern.
void register_kernel_cache(cl::Kernel *cache)
Registers a cached kernel.
tbb::concurrent_vector< cl::Kernel * > kernel_caches_
void select_device(int platform_id, int instance_id)
Selects the OpenCL device to use from now on.
The API to access the methods and values in opencl_context_base.
void check_opencl_error(const char *function, const cl::Error &e)
Throws the domain error with specifying the OpenCL error that occurred.
opencl_context_base(int platform_id=1, int device_id=1)
Construct the opencl_context by initializing the OpenCL context, devices, command queues,...
std::string capabilities() const
Returns the description of the OpenCL platforms and devices that are available.
std::string description() const
Returns the description of the OpenCL platform and device that is used.
opencl_context_base::map_base_opts & base_opts() noexcept
Returns a copy of the map of kernel defines.
bool & in_order() noexcept
Return a bool representing whether the write to the OpenCL device are blocking.
cl::Context & context() noexcept
Returns the reference to the OpenCL context.
std::vector< cl::Platform > & platform() noexcept
Returns a vector containing the OpenCL platform used to create the context.
std::vector< cl::Device > & device() noexcept
Returns a vector containing the OpenCL device used to create the context.
opencl_context_base::tuning_struct & tuning_opts() noexcept
Returns the thread block size for the Cholesky Decompositions L_11.
cl::CommandQueue & queue() noexcept
Returns the reference to the active OpenCL command queue for the device.
int max_thread_block_size() noexcept
Returns the maximum thread block size defined by CL_DEVICE_MAX_WORK_GROUP_SIZE for the device in the context.
static constexpr double e()
Return the base of the natural logarithm.
void system_error(const char *function, const char *name, const int &y, const char *msg1, const char *msg2)
Throw a system error with a consistently formatted message.
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
int cholesky_rev_block_partition
int tri_inverse_size_worth_transfer
int multiply_dim_prod_worth_transfer
int cholesky_size_worth_transfer
int cholesky_rev_min_block_size
int multiply_wgs_per_compute_unit
double gp_exp_quad_cov_complex
double gp_exp_quad_cov_simple
int cholesky_min_L11_size