opencl_context.hpp
#ifndef STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
#define STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
#ifdef STAN_OPENCL

#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
#ifndef OPENCL_DEVICE_ID
#error OPENCL_DEVICE_ID_NOT_SET
#endif
#ifndef OPENCL_PLATFORM_ID
#error OPENCL_PLATFORM_ID_NOT_SET
#endif
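
// Example (a sketch; exact variable names depend on the build setup): these
// macros are normally supplied at compile time rather than edited here. With
// CmdStan, OpenCL is typically enabled from make/local, e.g.
//
//   STAN_OPENCL=true
//   OPENCL_PLATFORM_ID=0
//   OPENCL_DEVICE_ID=0
//
// which the makefiles pass to the compiler roughly as
//   -DSTAN_OPENCL -DOPENCL_PLATFORM_ID=0 -DOPENCL_DEVICE_ID=0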

#include <stan/math/opencl/matrix_cl_view.hpp>
#include <stan/math/opencl/err/check_opencl.hpp>
#include <stan/math/prim/err/system_error.hpp>

#include <CL/opencl.hpp>
#include <tbb/concurrent_vector.h>
#include <string>
#include <sstream>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <vector>
#include <cmath>
#include <cerrno>

namespace stan {
namespace math {

/**
 * The opencl_context_base class represents an OpenCL context in the
 * standard Meyers singleton design pattern.
 */
class opencl_context_base {
  friend class opencl_context;

 private:
  /**
   * Construct the opencl_context by initializing the OpenCL context,
   * devices, and command queues.
   */
  opencl_context_base(int platform_id = OPENCL_PLATFORM_ID,
                      int device_id = OPENCL_DEVICE_ID) {
    try {
      // platform
      cl::Platform::get(&platforms_);
      if (platform_id >= platforms_.size()) {
        system_error("OpenCL Initialization", "[Platform]", -1,
                     "CL_INVALID_PLATFORM");
      }
      platform_.push_back(platforms_[platform_id]);
      platform_name_ = platform_[0].getInfo<CL_PLATFORM_NAME>();
      platform_[0].getDevices(DEVICE_FILTER, &devices_);
      if (devices_.size() == 0) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "CL_DEVICE_NOT_FOUND");
      }
      if (device_id >= devices_.size()) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "CL_INVALID_DEVICE");
      }
      device_.push_back(devices_[device_id]);
      // context and queue
      cl_command_queue_properties device_properties;
      device_[0].getInfo<cl_command_queue_properties>(
          CL_DEVICE_QUEUE_PROPERTIES, &device_properties);
      device_[0].getInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                 &max_thread_block_size_);
      std::vector<size_t> max_wg_sizes
          = device_[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
      if (max_wg_sizes.size() < 3) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "The device does not support 3D work groups!");
      }

      context_ = cl::Context(device_[0]);
      if (device_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
        command_queue_
            = cl::CommandQueue(context_, device_[0],
                               CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, nullptr);
        in_order_ = CL_FALSE;
      } else {
        command_queue_ = cl::CommandQueue(context_, device_[0], 0, nullptr);
        in_order_ = CL_TRUE;
      }
      int max_square_block_size
          = std::min({max_wg_sizes[0], max_wg_sizes[1],
                      static_cast<size_t>(std::sqrt(max_thread_block_size_))});

      // Checks, at context construction, the maximum allowed dimension of a
      // square thread block. A work-group size of (32, 32) works on all
      // recent GPUs but would fail on some older integrated GPUs or CPUs.
      if (max_square_block_size < base_opts_["THREAD_BLOCK_SIZE"]) {
        base_opts_["THREAD_BLOCK_SIZE"] = max_square_block_size;
        base_opts_["WORK_PER_THREAD"] = 1;
      }
      if (std::min(max_thread_block_size_, max_wg_sizes[0])
          < base_opts_["LOCAL_SIZE_"]) {
        // must be a power of base_opts_["REDUCTION_STEP_SIZE"]
        const int p = std::log(max_thread_block_size_)
                      / std::log(base_opts_["REDUCTION_STEP_SIZE"]);
        base_opts_["LOCAL_SIZE_"]
            = std::pow(base_opts_["REDUCTION_STEP_SIZE"], p);
      }
      // Thread block size for the Cholesky
      // TODO(Steve): This should be tuned in a higher part of the stan language
      if (max_thread_block_size_ >= 256) {
        tuning_opts_.cholesky_min_L11_size = 256;
      } else {
        tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
      }
    } catch (const cl::Error& e) {
      check_opencl_error("opencl_context", e);
    }
  }

 protected:
  cl::Context context_;  // Manages the device, queue, platform, memory, etc.
  cl::CommandQueue command_queue_;  // job queue for device, one per device
  std::vector<cl::Platform> platforms_;  // Vector of available platforms
  std::vector<cl::Platform> platform_;   // The platform for compiling kernels
  std::string platform_name_;  // The platform, such as NVIDIA OpenCL or AMD SDK
  std::vector<cl::Device> device_;   // The selected OpenCL device
  std::vector<cl::Device> devices_;  // All available OpenCL devices
  std::string device_name_;          // The name of the OpenCL device
  size_t max_thread_block_size_;  // The maximum size of a block of workers on
                                  // the device
  bool in_order_;  // Whether the command queue executes in order
  // Holds default parameter values for each kernel.
  using map_base_opts = std::unordered_map<std::string, int>;
  map_base_opts base_opts_
      = {{"LOWER", static_cast<int>(matrix_cl_view::Lower)},
         {"UPPER", static_cast<int>(matrix_cl_view::Upper)},
         {"ENTIRE", static_cast<int>(matrix_cl_view::Entire)},
         {"DIAGONAL", static_cast<int>(matrix_cl_view::Diagonal)},
         {"THREAD_BLOCK_SIZE", 32},
         {"WORK_PER_THREAD", 8},
         {"REDUCTION_STEP_SIZE", 4},
         {"LOCAL_SIZE_", 4096}};
  // TODO(Steve): Make these tunable during warmup
  struct tuning_struct {
    // Used in math/opencl/cholesky_decompose
    int cholesky_min_L11_size = 256;
    int cholesky_partition = 4;
    int cholesky_size_worth_transfer = 1250;
    // Used in math/rev/fun/cholesky_decompose
    int cholesky_rev_min_block_size = 512;
    int cholesky_rev_block_partition = 8;
    // used in math/opencl/multiply
    int multiply_dim_prod_worth_transfer = 2000000;
    // used in math/prim/fun/gp_exp_quad_cov
    double gp_exp_quad_cov_complex = 1'000'000;
    double gp_exp_quad_cov_simple = 100'000;
    // used in math/prim/fun/multiply
    // and math/rev/fun/multiply
    int multiply_wgs_per_compute_unit = 5;
    // used in math/prim/fun/mdivide_left_tri
    // and math/rev/fun/mdivide_left_tri
    int tri_inverse_size_worth_transfer = 100;
  } tuning_opts_;

 protected:
  static opencl_context_base& getInstance() noexcept {
    static opencl_context_base instance_;
    return instance_;
  }

  static void select_device(int platform_id, int device_id) {
    getInstance() = opencl_context_base(platform_id, device_id);
  }
};
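
// The class above is a Meyers singleton: its only instance is a
// function-local static, created on the first call to getInstance() and,
// since C++11, initialized thread-safely. A minimal sketch of the pattern,
// independent of OpenCL:
//
//   class config {
//    public:
//     static config& instance() {
//       static config instance_;  // constructed exactly once, on first use
//       return instance_;
//     }
//    private:
//     config() = default;  // private: clients must go through instance()
//   };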

/**
 * The API to access the methods and values in opencl_context_base.
 */
class opencl_context {
  tbb::concurrent_vector<cl::Kernel*> kernel_caches_;

 public:
  opencl_context() = default;

  /**
   * Returns the description of the OpenCL platform and device that is used.
   */
  inline std::string description() const {
    std::ostringstream msg;

    msg << "Platform ID: " << OPENCL_PLATFORM_ID << "\n";
    msg << "Platform Name: "
        << opencl_context_base::getInstance()
               .platform_[0]
               .getInfo<CL_PLATFORM_NAME>()
        << "\n";
    msg << "Platform Vendor: "
        << opencl_context_base::getInstance()
               .platform_[0]
               .getInfo<CL_PLATFORM_VENDOR>()
        << "\n";
    msg << "\tDevice " << OPENCL_DEVICE_ID << ": "
        << "\n";
    msg << "\t\tDevice Name: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_NAME>()
        << "\n";
    msg << "\t\tDevice Type: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_TYPE>()
        << "\n";
    msg << "\t\tDevice Vendor: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_VENDOR>()
        << "\n";
    msg << "\t\tDevice Max Compute Units: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
        << "\n";
    msg << "\t\tDevice Global Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()
        << "\n";
    msg << "\t\tDevice Max Clock Frequency: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()
        << "\n";
    msg << "\t\tDevice Max Allocatable Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()
        << "\n";
    msg << "\t\tDevice Local Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()
        << "\n";
    msg << "\t\tDevice Available: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_AVAILABLE>()
        << "\n";
    return msg.str();
  }
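
  // Example use (a sketch; it assumes the global `opencl_context` instance
  // declared at the bottom of this file):
  //
  //   std::cout << stan::math::opencl_context.description();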

  /**
   * Returns the description of the OpenCL platforms and devices that are
   * available.
   */
  inline std::string capabilities() const {
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);
    std::ostringstream msg;
    int platform_id = 0;
    int device_id = 0;

    msg << "Number of Platforms: " << all_platforms.size() << "\n";
    for (auto plat_iter : all_platforms) {
      cl::Platform platform(plat_iter);

      msg << "Platform ID: " << platform_id++ << "\n";
      msg << "Platform Name: " << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
      msg << "Platform Vendor: " << platform.getInfo<CL_PLATFORM_VENDOR>()
          << "\n";

      try {
        std::vector<cl::Device> all_devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);

        for (auto device_iter : all_devices) {
          cl::Device device(device_iter);

          msg << "\tDevice " << device_id++ << ": "
              << "\n";
          msg << "\t\tDevice Name: " << device.getInfo<CL_DEVICE_NAME>()
              << "\n";
          msg << "\t\tDevice Type: " << device.getInfo<CL_DEVICE_TYPE>()
              << "\n";
          msg << "\t\tDevice Vendor: " << device.getInfo<CL_DEVICE_VENDOR>()
              << "\n";
          msg << "\t\tDevice Max Compute Units: "
              << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n";
          msg << "\t\tDevice Global Memory: "
              << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
          msg << "\t\tDevice Max Clock Frequency: "
              << device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << "\n";
          msg << "\t\tDevice Max Allocatable Memory: "
              << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << "\n";
          msg << "\t\tDevice Local Memory: "
              << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << "\n";
          msg << "\t\tDevice Available: "
              << device.getInfo<CL_DEVICE_AVAILABLE>() << "\n";
        }
      } catch (const cl::Error& e) {
        // If a platform has no devices matching the device type, OpenCL
        // throws error code -1 (CL_DEVICE_NOT_FOUND); any other error is
        // rethrown as a system error.
        if (e.err() == -1) {
          msg << "\tno (OpenCL) devices in the platform with ID " << platform_id
              << "\n";
        } else {
          check_opencl_error("capabilities", e);
        }
      }
    }
    return msg.str();
  }
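
  // Example use (sketch): list every platform and device the OpenCL runtime
  // can see, e.g. to choose valid OPENCL_PLATFORM_ID / OPENCL_DEVICE_ID
  // values:
  //
  //   std::cout << stan::math::opencl_context.capabilities();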

  /**
   * Returns the reference to the OpenCL context.
   */
  inline cl::Context& context() noexcept {
    return opencl_context_base::getInstance().context_;
  }

  /**
   * Returns the reference to the active OpenCL command queue for the device.
   */
  inline cl::CommandQueue& queue() noexcept {
    return opencl_context_base::getInstance().command_queue_;
  }

  /**
   * Returns the map of kernel defines.
   */
  inline opencl_context_base::map_base_opts& base_opts() noexcept {
    return opencl_context_base::getInstance().base_opts_;
  }

  /**
   * Returns the maximum thread block size defined by
   * CL_DEVICE_MAX_WORK_GROUP_SIZE for the device in the context.
   */
  inline int max_thread_block_size() noexcept {
    return opencl_context_base::getInstance().max_thread_block_size_;
  }

  /**
   * Returns the tuning options for the OpenCL routines, including the
   * thread block size for the Cholesky decomposition's L_11 block.
   */
  inline opencl_context_base::tuning_struct& tuning_opts() noexcept {
    return opencl_context_base::getInstance().tuning_opts_;
  }

  /**
   * Returns a vector containing the OpenCL device used to create the context.
   */
  inline std::vector<cl::Device>& device() noexcept {
    return opencl_context_base::getInstance().device_;
  }

  /**
   * Returns a vector containing the OpenCL platform used to create the
   * context.
   */
  inline std::vector<cl::Platform>& platform() noexcept {
    return opencl_context_base::getInstance().platform_;
  }

  /**
   * Returns a bool indicating whether writes to the OpenCL device are
   * blocking (in-order execution).
   */
  inline bool& in_order() noexcept {
    return opencl_context_base::getInstance().in_order_;
  }

  /**
   * Selects the OpenCL device to use from now on. Resets all registered
   * kernel caches so kernels are rebuilt for the newly selected device.
   */
  inline void select_device(int platform_id, int instance_id) {
    for (cl::Kernel* cache : kernel_caches_) {
      *cache = cl::Kernel();
    }
    kernel_caches_.clear();
    opencl_context_base::select_device(platform_id, instance_id);
  }
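
  // Example use (sketch): switch to device 1 on platform 0 at runtime;
  // previously registered kernel caches are reset so kernels recompile for
  // the new device:
  //
  //   stan::math::opencl_context.select_device(0, 1);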

  /**
   * Registers a cached kernel so it can be reset when a different device
   * is selected.
   */
  inline void register_kernel_cache(cl::Kernel* cache) {
    kernel_caches_.push_back(cache);
  }
};
static opencl_context opencl_context;
}  // namespace math
}  // namespace stan

#endif
#endif