Automatic Differentiation
 
Loading...
Searching...
No Matches
matrix_cl.hpp
Go to the documentation of this file.
1#ifndef STAN_MATH_OPENCL_MATRIX_CL_HPP
2#define STAN_MATH_OPENCL_MATRIX_CL_HPP
3#ifdef STAN_OPENCL
4
14#include <CL/opencl.hpp>
15#include <tbb/concurrent_vector.h>
16#include <algorithm>
17#include <iostream>
18#include <string>
19#include <type_traits>
20#include <vector>
21
28namespace stan {
29namespace math {
30
35// forward declare
36template <typename T>
37class arena_matrix_cl;
38
39template <typename>
40class matrix_cl;
41
46template <typename T>
47class matrix_cl : public matrix_cl_base {
48 private:
49 cl::Buffer buffer_cl_; // Holds the allocated memory on the device
50 int rows_{0}; // Number of rows.
51 int cols_{0}; // Number of columns.
52 // Holds info on if matrix is a special type
54 mutable tbb::concurrent_vector<cl::Event> write_events_; // Tracks write jobs
55 mutable tbb::concurrent_vector<cl::Event> read_events_; // Tracks reads
56
57 public:
58 using Scalar = T; // Underlying type of the matrix
59 using type = T; // Underlying type of the matrix
60 // Forward declare the methods that work in place on the matrix
61 template <matrix_cl_view matrix_view = matrix_cl_view::Entire>
62 inline void zeros_strict_tri();
63
64 int rows() const { return rows_; }
65
66 int cols() const { return cols_; }
67
68 int size() const { return rows_ * cols_; }
69
70 const matrix_cl_view& view() const { return view_; }
71
72 void view(const matrix_cl_view& view) { view_ = view; }
73
77 inline void clear_write_events() const {
78 write_events_.clear();
79 return;
80 }
81
85 inline void clear_read_events() const {
86 read_events_.clear();
87 return;
88 }
89
93 inline void clear_read_write_events() const {
94 read_events_.clear();
95 write_events_.clear();
96 return;
97 }
98
103 inline const tbb::concurrent_vector<cl::Event>& write_events() const {
104 return write_events_;
105 }
106
111 inline const tbb::concurrent_vector<cl::Event>& read_events() const {
112 return read_events_;
113 }
114
119 inline const tbb::concurrent_vector<cl::Event> read_write_events() const {
120 return vec_concat(this->read_events(), this->write_events());
121 }
122
127 inline void add_read_event(cl::Event new_event) const {
128 this->read_events_.push_back(new_event);
129 }
130
135 inline void add_write_event(cl::Event new_event) const {
136 this->write_events_.push_back(new_event);
137 }
138
143 inline void add_read_write_event(cl::Event new_event) const {
144 this->read_events_.push_back(new_event);
145 this->write_events_.push_back(new_event);
146 }
147
151 inline void wait_for_write_events() const {
152 for (cl::Event e : write_events_) {
153 e.wait();
154 }
155 write_events_.clear();
156 }
157
161 inline void wait_for_read_events() const {
162 for (cl::Event e : read_events_) {
163 e.wait();
164 }
165 read_events_.clear();
166 }
167
172 inline void wait_for_read_write_events() const {
175 }
176
177 const cl::Buffer& buffer() const { return buffer_cl_; }
178 cl::Buffer& buffer() { return buffer_cl_; }
179
190 matrix_cl(const cl::Buffer& A, const int R, const int C,
192 : buffer_cl_(A), rows_(R), cols_(C), view_(partial_view) {}
193
199 : rows_(A.rows()), cols_(A.cols()), view_(A.view()) {
200 if (A.size() == 0) {
201 return;
202 }
203 buffer_cl_ = cl::Buffer(opencl_context.context(), CL_MEM_READ_WRITE,
204 sizeof(T) * this->size());
206 }
207
213 : buffer_cl_(std::move(A.buffer_cl_)),
214 rows_(A.rows_),
215 cols_(A.cols_),
216 view_(A.view_),
218 read_events_(std::move(A.read_events_)) {}
219
224 // defined in rev/arena_matrix_cl.hpp
225 matrix_cl(const arena_matrix_cl<T>& A); // NOLINT(runtime/explicit)
226
246 template <typename Vec, require_std_vector_vt<is_eigen, Vec>* = nullptr,
247 require_st_same<Vec, T>* = nullptr>
248 explicit matrix_cl(Vec&& A) try : rows_(A.empty() ? 0 : A[0].size()),
249 cols_(A.size()) {
250 if (this->size() == 0) {
251 return;
252 }
253 cl::Context& ctx = opencl_context.context();
254 cl::CommandQueue& queue = opencl_context.queue();
255 buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
256 for (int i = 0, offset_size = 0; i < cols_; i++, offset_size += rows_) {
257 check_size_match("matrix constructor", "input rows", A[i].size(),
258 "matrix_cl rows", rows_);
259 cl::Event write_event;
260 queue.enqueueWriteBuffer(
262 opencl_context.in_order() || std::is_rvalue_reference<Vec&&>::value,
263 sizeof(T) * offset_size, sizeof(T) * rows_, A[i].data(), nullptr,
264 &write_event);
265 this->add_write_event(write_event);
266 }
267 } catch (const cl::Error& e) {
268 check_opencl_error("matrix constructor", e);
269 }
270
284 matrix_cl(const int rows, const int cols,
286 : rows_(rows), cols_(cols), view_(partial_view) {
287 if (this->size() == 0) {
288 return;
289 }
290 cl::Context& ctx = opencl_context.context();
291 try {
292 int flags = CL_MEM_READ_WRITE;
293 if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
294 flags |= CL_MEM_ALLOC_HOST_PTR;
295 }
296 buffer_cl_ = cl::Buffer(ctx, flags, sizeof(T) * rows_ * cols_);
297 } catch (const cl::Error& e) {
298 check_opencl_error("matrix constructor", e);
299 }
300 }
301
319 template <typename Mat, require_eigen_t<Mat>* = nullptr,
320 require_vt_same<Mat, T>* = nullptr>
321 explicit matrix_cl(Mat&& A,
323 : rows_(A.rows()), cols_(A.cols()), view_(partial_view) {
324 using Mat_type = std::decay_t<ref_type_for_opencl_t<Mat>>;
325 if (this->size() == 0) {
326 return;
327 }
329 std::is_same<std::decay_t<Mat>, Mat_type>::value
330 && (std::is_lvalue_reference<Mat>::value
332 }
333
349 template <typename Scal,
351 explicit matrix_cl(Scal&& A,
353 : rows_(1), cols_(1), view_(partial_view) {
354 initialize_buffer<std::is_rvalue_reference<Scal&&>::value>(
355 const_cast<const std::decay_t<Scal>*>(&A));
356 }
357
372 template <typename Vec, require_std_vector_t<Vec>* = nullptr,
373 require_vt_same<Vec, T>* = nullptr>
374 explicit matrix_cl(Vec&& A,
376 : matrix_cl(std::forward<Vec>(A), A.size(), 1) {}
377
394 template <typename Vec, require_std_vector_t<Vec>* = nullptr,
395 require_vt_same<Vec, T>* = nullptr>
396 explicit matrix_cl(Vec&& A, const int& R, const int& C,
398 : rows_(R), cols_(C), view_(partial_view) {
399 initialize_buffer_no_heap_if<std::is_lvalue_reference<Vec>::value>(A);
400 }
401
417 template <typename U, require_same_t<T, U>* = nullptr>
418 explicit matrix_cl(const U* A, const int& R, const int& C,
420 : rows_(R), cols_(C), view_(partial_view) {
422 }
423
430 // defined in kernel_generator/matrix_cl_conversion.hpp
431 template <typename Expr,
434 matrix_cl(const Expr& expression); // NOLINT(runtime/explicit)
435
440 view_ = a.view();
441 rows_ = a.rows();
442 cols_ = a.cols();
444 buffer_cl_ = std::move(a.buffer_cl_);
445 write_events_ = std::move(a.write_events_);
446 read_events_ = std::move(a.read_events_);
447 return *this;
448 }
449
454 this->view_ = a.view();
455 if (a.size() == 0) {
456 this->rows_ = a.rows();
457 this->cols_ = a.cols();
458 return *this;
459 }
461 if (this->size() != a.size()) {
462 buffer_cl_ = cl::Buffer(opencl_context.context(), CL_MEM_READ_WRITE,
463 sizeof(T) * a.size());
464 }
465 this->rows_ = a.rows();
466 this->cols_ = a.cols();
468 return *this;
469 }
470
477 // defined in kernel_generator/matrix_cl_conversion.hpp
478 template <typename Expr,
481 matrix_cl<T>& operator=(const Expr& expression);
482
488 // defined in rev/arena_matrix_cl.hpp
490
495 const matrix_cl<T>& eval() const& { return *this; }
496 matrix_cl<T> eval() && { return std::move(*this); }
497
503
507 void setZero() {
508 if (this->size() == 0) {
509 return;
510 }
511 cl::Event zero_event;
512 const std::size_t write_events_size = this->write_events().size();
513 const std::size_t read_events_size = this->read_events().size();
514 const std::size_t read_write_size = write_events_size + read_events_size;
515 std::vector<cl::Event> read_write_events(read_write_size, cl::Event{});
516 auto&& read_events_vec = this->read_events();
517 auto&& write_events_vec = this->write_events();
518 for (std::size_t i = 0; i < read_events_size; ++i) {
519 read_write_events[i] = read_events_vec[i];
520 }
521 for (std::size_t i = read_events_size, j = 0; j < write_events_size;
522 ++i, ++j) {
523 read_write_events[i] = write_events_vec[j];
524 }
525 try {
526 opencl_context.queue().enqueueFillBuffer(buffer_cl_, static_cast<T>(0), 0,
527 sizeof(T) * this->size(),
528 &read_write_events, &zero_event);
529 } catch (const cl::Error& e) {
530 check_opencl_error("setZero", e);
531 }
532 this->add_write_event(zero_event);
533 }
534
535 private:
550 template <bool in_order = false>
551 cl::Event initialize_buffer(const T* A) {
552 cl::Event transfer_event;
553 if (this->size() == 0) {
554 return transfer_event;
555 }
556 cl::Context& ctx = opencl_context.context();
557 cl::CommandQueue& queue = opencl_context.queue();
558 try {
559 buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
560 queue.enqueueWriteBuffer(buffer_cl_,
561 opencl_context.in_order() || in_order, 0,
562 sizeof(T) * size(), A, nullptr, &transfer_event);
563 this->add_write_event(transfer_event);
564 } catch (const cl::Error& e) {
565 check_opencl_error("initialize_buffer", e);
566 }
567 return transfer_event;
568 }
569
570 template <bool in_order = false>
571 cl::Event initialize_buffer(T* A) {
572 cl::Event transfer_event;
573 if (this->size() == 0) {
574 return transfer_event;
575 }
576 cl::Context& ctx = opencl_context.context();
577 cl::CommandQueue& queue = opencl_context.queue();
578 try {
579 if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
580 constexpr auto copy_or_share
581 = CL_MEM_COPY_HOST_PTR * INTEGRATED_OPENCL
582 | (CL_MEM_USE_HOST_PTR * !INTEGRATED_OPENCL);
584 = cl::Buffer(ctx, CL_MEM_READ_WRITE | copy_or_share,
585 sizeof(T) * size(), A); // this is always synchronous
586 } else {
587 buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
588 queue.enqueueWriteBuffer(
589 buffer_cl_, opencl_context.in_order() || in_order, 0,
590 sizeof(T) * size(), A, nullptr, &transfer_event);
591 this->add_write_event(transfer_event);
592 }
593 } catch (const cl::Error& e) {
594 check_opencl_error("initialize_buffer", e);
595 }
596 return transfer_event;
597 }
598
613 template <bool No_heap, typename U, std::enable_if_t<No_heap>* = nullptr>
615 if (this->size() == 0) {
616 return;
617 }
618 initialize_buffer(obj.data());
619 }
620 // we need separate overloads as obj.data() might not be available when second
621 // overload is called.
622 template <bool No_heap, typename U, std::enable_if_t<!No_heap>* = nullptr>
624 using U_val = std::decay_t<ref_type_for_opencl_t<U>>;
625 if (this->size() == 0) {
626 return;
627 }
628 auto* obj_heap = new U_val(std::move(obj));
629 try {
630 cl::Event e = initialize_buffer(obj_heap->data());
631 if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
632 buffer_cl_.setDestructorCallback(&delete_it_destructor<U_val>,
633 obj_heap);
634 } else {
635 e.setCallback(CL_COMPLETE, &delete_it_event<U_val>, obj_heap);
636 }
637 } catch (...) {
638 delete obj_heap;
639 throw;
640 }
641 }
642
650 cl::Event cstr_event;
651 std::vector<cl::Event>* dep_events = new std::vector<cl::Event>(
652 A.write_events().begin(), A.write_events().end());
653 try {
654 opencl_context.queue().enqueueCopyBuffer(A.buffer(), this->buffer(), 0, 0,
655 A.size() * sizeof(T), dep_events,
656 &cstr_event);
657 if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
658 buffer_cl_.setDestructorCallback(
659 &delete_it_destructor<std::vector<cl::Event>>, dep_events);
660 } else {
661 cstr_event.setCallback(
662 CL_COMPLETE, &delete_it_event<std::vector<cl::Event>>, dep_events);
663 }
664 this->add_write_event(cstr_event);
665 A.add_read_event(cstr_event);
666 } catch (const cl::Error& e) {
667 delete dep_events;
668 check_opencl_error("copy (OpenCL)->(OpenCL)", e);
669 } catch (...) {
670 delete dep_events;
671 throw;
672 }
673 }
674
682 template <typename U>
683 static void delete_it_event(cl_event e, cl_int status, void* container) {
684 delete static_cast<U*>(container);
685 }
686
693 template <typename U>
694 static void delete_it_destructor(cl_mem buff, void* container) {
695 delete static_cast<U*>(container);
696 }
697};
700} // namespace math
701} // namespace stan
702
703#endif
704#endif
A variant of matrix_cl that schedules its destructor to be called, so it can be used on the AD stack.
Non-templated base class for matrix_cl simplifies checking if something is matrix_cl.
matrix_cl(Vec &&A, matrix_cl_view partial_view=matrix_cl_view::Entire)
Construct a matrix_cl of size Nx1 from std::vector.
void initialize_buffer_cl(const matrix_cl< T > &A)
Initializes the OpenCL buffer of this matrix by copying the data from given matrix_cl.
matrix_cl< T > & operator=(matrix_cl< T > &&a)
Move assignment operator.
matrix_cl(const int rows, const int cols, matrix_cl_view partial_view=matrix_cl_view::Entire)
Constructor for the matrix_cl that only allocates the buffer on the OpenCL device.
const cl::Buffer & buffer() const
const tbb::concurrent_vector< cl::Event > read_write_events() const
Get the events from the event stacks.
void wait_for_write_events() const
Waits for the write events and clears the read event stack.
void initialize_buffer_no_heap_if(U &&obj)
Initializes the OpenCL buffer of this matrix by copying the data from given object.
void add_read_event(cl::Event new_event) const
Add an event to the read event stack.
void wait_for_read_write_events() const
Waits for read and write events to finish and clears the read, write, and read/write event stacks.
void add_read_write_event(cl::Event new_event) const
Add an event to the read/write event stack.
void setZero()
Set the values of a matrix_cl to zero.
void wait_for_read_events() const
Waits for the read events and clears the read event stack.
cl::Buffer & buffer()
static void delete_it_destructor(cl_mem buff, void *container)
Deletes the container.
~matrix_cl()
Destructor waits for write events to prevent any kernels from writing memory that has already been re...
matrix_cl(const cl::Buffer &A, const int R, const int C, matrix_cl_view partial_view=matrix_cl_view::Entire)
Construct a matrix_cl<T> from an existing cl::Buffer object.
matrix_cl_view view_
Definition matrix_cl.hpp:53
matrix_cl(Vec &&A)
Constructor for the matrix_cl that creates a copy of a std::vector of Eigen matrices on the OpenCL de...
tbb::concurrent_vector< cl::Event > read_events_
Definition matrix_cl.hpp:55
tbb::concurrent_vector< cl::Event > write_events_
Definition matrix_cl.hpp:54
matrix_cl(const U *A, const int &R, const int &C, matrix_cl_view partial_view=matrix_cl_view::Entire)
Construct from array with given rows and columns.
matrix_cl(Vec &&A, const int &R, const int &C, matrix_cl_view partial_view=matrix_cl_view::Entire)
Construct from std::vector with given rows and columns.
matrix_cl< T > eval() &&
void view(const matrix_cl_view &view)
Definition matrix_cl.hpp:72
const matrix_cl_view & view() const
Definition matrix_cl.hpp:70
void clear_write_events() const
Clear the write events from the event stacks.
Definition matrix_cl.hpp:77
cl::Event initialize_buffer(T *A)
const matrix_cl< T > & eval() const &
Evaluates this.
static void delete_it_event(cl_event e, cl_int status, void *container)
Deletes the container.
const tbb::concurrent_vector< cl::Event > & read_events() const
Get the events from the event stacks.
void add_write_event(cl::Event new_event) const
Add an event to the write event stack.
matrix_cl(Scal &&A, matrix_cl_view partial_view=matrix_cl_view::Diagonal)
Constructor for the matrix_cl that creates a copy of a scalar on the OpenCL device.
matrix_cl< T > & operator=(const matrix_cl< T > &a)
Copy assignment operator.
matrix_cl(Mat &&A, matrix_cl_view partial_view=matrix_cl_view::Entire)
Constructor for the matrix_cl that creates a copy of the Eigen matrix or Eigen expression on the Open...
matrix_cl(const matrix_cl< T > &A)
Copy constructor.
matrix_cl(matrix_cl< T > &&A)
Move constructor.
void clear_read_write_events() const
Clear the write events from the event stacks.
Definition matrix_cl.hpp:93
cl::Event initialize_buffer(const T *A)
Initializes the OpenCL buffer of this matrix by copying the data from given buffer.
const tbb::concurrent_vector< cl::Event > & write_events() const
Get the events from the event stacks.
void clear_read_events() const
Clear the read events from the event stacks.
Definition matrix_cl.hpp:85
Represents an arithmetic matrix on the OpenCL device.
Definition matrix_cl.hpp:47
The API to access the methods and values in opencl_context_base.
void check_opencl_error(const char *function, const cl::Error &e)
Throws the domain error with specifying the OpenCL error that occurred.
void zeros_strict_tri()
Stores zeros in the strict's triangular part (excluding the diagonal) of a matrix on the OpenCL devic...
require_not_t< is_matrix_cl< std::decay_t< T > > > require_not_matrix_cl_t
Require type does not satisfy is_matrix_cl.
bool & in_order() noexcept
Return a bool representing whether the write to the OpenCL device are blocking.
cl::Context & context() noexcept
Returns the reference to the OpenCL context.
std::vector< cl::Device > & device() noexcept
Returns a vector containing the OpenCL device used to create the context.
cl::CommandQueue & queue() noexcept
Returns the reference to the active OpenCL command queue for the device.
require_all_t< is_kernel_expression_and_not_scalar< Types >... > require_all_kernel_expressions_and_none_scalar_t
Enables a template if all given types are non-scalar types that are a valid kernel generator expressi...
require_t< std::is_same< std::decay_t< T >, std::decay_t< S > > > require_same_t
Require types T and S satisfies std::is_same.
static constexpr double e()
Return the base of the natural logarithm.
Definition constants.hpp:20
auto vec_concat(const Vec &v1, const Args &... args)
Get the event stack from a vector of events and other arguments.
void check_size_match(const char *function, const char *name_i, T_size1 i, const char *name_j, T_size2 j)
Check if the provided sizes match.
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
STL namespace.
Check if a type is an Eigen::Map with contiguous stride.
Definition is_eigen.hpp:326