math/opencl_2prim_2multiply_8hpp_source.html

#ifndef STAN_MATH_OPENCL_PRIM_MULTIPLY_HPP

#define STAN_MATH_OPENCL_PRIM_MULTIPLY_HPP

#ifdef STAN_OPENCL


#include <stan/math/opencl/matrix_cl.hpp>

#include <stan/math/opencl/err.hpp>

#include <stan/math/opencl/kernel_generator.hpp>

#include <stan/math/opencl/kernels/matrix_multiply.hpp>

#include <stan/math/opencl/kernels/add.hpp>

#include <stan/math/opencl/scalar_type.hpp>

#include <stan/math/prim/fun/Eigen.hpp>

#include <stan/math/prim/meta.hpp>

#include <algorithm>


namespace stan {

namespace math {


/**
 * Computes the matrix product of two non-scalar OpenCL kernel generator
 * expressions.
 *
 * Dispatch, in order:
 *  - if either operand is empty, a `A.rows()` x `B.cols()` matrix filled with
 *    zeros is returned;
 *  - if `A` has a single row, the `row_vector_matrix_multiply` kernel is used;
 *  - if `B` has a single column, the work is delegated to
 *    `matrix_vector_multiply`;
 *  - otherwise the `matrix_multiply` kernel is launched, either once or with
 *    an additional third NDRange dimension of size `split` followed by an
 *    `add_batch` call that writes the final result.
 *
 * `A.cols()` and `B.rows()` are validated with `check_size_match`. Any
 * `cl::Error` raised while launching a kernel is handed to
 * `check_opencl_error`.
 *
 * @tparam T1 type of the left operand
 * @tparam T2 type of the right operand
 * @param A left operand
 * @param B right operand
 * @return matrix with `A.rows()` rows and `B.cols()` columns holding the
 *   product
 */
template <typename T1, typename T2,
          typename = require_all_kernel_expressions_and_none_scalar_t<T1, T2>>
inline matrix_cl<return_type_t<T1, T2>> multiply(const T1& A, const T2& B) {
  using result_t = matrix_cl<return_type_t<T1, T2>>;

  check_size_match("multiply ((OpenCL))", "A.cols()", A.cols(), "B.rows()",
                   B.rows());

  // Nothing to multiply: hand back zeros of the output shape.
  if (A.size() == 0 || B.size() == 0) {
    return constant(0.0, A.rows(), B.cols());
  }

  // The output view is the combination of both input views.
  result_t product(A.rows(), B.cols(), either(A.view(), B.view()));

  // Row vector times matrix.
  if (A.rows() == 1) {
    const int threads_per_group
        = opencl_kernels::row_vector_matrix_multiply.get_option("LOCAL_SIZE_");
    try {
      // One work group of `threads_per_group` threads per output column.
      opencl_kernels::row_vector_matrix_multiply(
          cl::NDRange(product.cols() * threads_per_group),
          cl::NDRange(threads_per_group), A.eval(), B.eval(), product,
          B.rows(), B.cols(), A.view(), B.view());
    } catch (const cl::Error& e) {
      check_opencl_error("row_vector - matrix multiply", e);
    }
    return product;
  }

  // Matrix times column vector.
  if (B.cols() == 1) {
    product = matrix_vector_multiply(A, B);
    return product;
  }

  // General case.
  const int block_size
      = opencl_kernels::matrix_multiply.get_option("THREAD_BLOCK_SIZE");
  const int work_per_thread
      = opencl_kernels::matrix_multiply.get_option("WORK_PER_THREAD");

  // Output dimensions rounded up to the next multiple of the block size.
  const int rows_padded = ((A.rows() + block_size - 1) / block_size) * block_size;
  const int cols_padded = ((B.cols() + block_size - 1) / block_size) * block_size;

  // Number of work groups a single (unsplit) launch would use.
  const int n_work_groups = rows_padded / block_size * cols_padded / block_size;

  // Number of work groups the tuning options ask for on this device.
  const int compute_units = static_cast<int>(
      opencl_context.device()[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
  const int wanted_work_groups
      = opencl_context.tuning_opts().multiply_wgs_per_compute_unit
        * compute_units;

  // Ceil-divide the wanted count by the available count, capped by the number
  // of whole blocks that fit in the shared dimension.
  const int split
      = std::min(A.cols() / block_size,
                 (wanted_work_groups + n_work_groups - 1) / n_work_groups);

  try {
    if (split > 1) {
      // NOTE(review): presumably `partials` holds `split` partial products
      // that `add_batch` sums into `product` -- confirm against the kernel
      // docs.
      result_t partials(A.rows(), B.cols() * split);
      opencl_kernels::matrix_multiply(
          cl::NDRange(rows_padded, cols_padded / work_per_thread, split),
          cl::NDRange(block_size, block_size / work_per_thread, 1), A.eval(),
          B.eval(), partials, A.rows(), B.cols(), B.rows(), A.view(),
          B.view());
      opencl_kernels::add_batch(cl::NDRange(A.rows(), B.cols()), product,
                                partials, A.rows(), B.cols(), split);
    } else {
      opencl_kernels::matrix_multiply(
          cl::NDRange(rows_padded, cols_padded / work_per_thread),
          cl::NDRange(block_size, block_size / work_per_thread), A.eval(),
          B.eval(), product, A.rows(), B.cols(), B.rows(), A.view(),
          B.view());
    }
  } catch (const cl::Error& e) {
    check_opencl_error("multiply", e);
  }
  return product;
}


/**
 * Matrix multiplication operator for two non-scalar kernel generator
 * expressions.
 *
 * Each operand is passed through `as_operation_cl` and evaluated, and the
 * evaluated operands are forwarded to `multiply`.
 *
 * @tparam T_a type of the left operand
 * @tparam T_b type of the right operand
 * @param a left operand
 * @param b right operand
 * @return result of `multiply` applied to the evaluated operands
 */
template <typename T_a, typename T_b,
          typename = require_all_kernel_expressions_and_none_scalar_t<T_a, T_b>>
inline matrix_cl<return_type_t<T_a, T_b>> operator*(const T_a& a,
                                                    const T_b& b) {
  // Both sides are evaluated right here, so the arguments are simply taken by
  // const reference; perfect forwarding would gain nothing.
  matrix_cl<return_type_t<T_a, T_b>> product
      = multiply(as_operation_cl(a).eval(), as_operation_cl(b).eval());
  return product;
}


/**
 * Multiplies a scalar by a non-scalar kernel generator expression.
 *
 * The result is whatever `a * b` yields for these operand types, converted to
 * a `matrix_cl`.
 * NOTE(review): presumably an elementwise scaling; the scalar/expression
 * `operator*` is defined elsewhere -- confirm.
 *
 * @tparam T_a scalar type (must satisfy `require_stan_scalar_t`)
 * @tparam T_b type of the kernel generator expression
 * @param a scalar factor
 * @param b expression factor
 * @return `a * b` stored in a `matrix_cl`
 */
template <typename T_a, typename T_b, require_stan_scalar_t<T_a>* = nullptr,
          require_all_kernel_expressions_and_none_scalar_t<T_b>* = nullptr,
          require_all_not_var_t<T_a, T_b>* = nullptr>
inline matrix_cl<return_type_t<T_a, T_b>> multiply(const T_a& a, const T_b& b) {
  matrix_cl<return_type_t<T_a, T_b>> scaled = a * b;
  return scaled;
}


/**
 * Multiplies a non-scalar kernel generator expression by a scalar.
 *
 * The result is whatever `a * b` yields for these operand types, converted to
 * a `matrix_cl`.
 * NOTE(review): presumably an elementwise scaling; the expression/scalar
 * `operator*` is defined elsewhere -- confirm.
 *
 * @tparam T_a type of the kernel generator expression
 * @tparam T_b scalar type (must satisfy `require_stan_scalar_t`)
 * @param a expression factor
 * @param b scalar factor
 * @return `a * b` stored in a `matrix_cl`
 */
template <typename T_a, typename T_b, require_stan_scalar_t<T_b>* = nullptr,
          require_all_kernel_expressions_and_none_scalar_t<T_a>* = nullptr,
          require_all_not_var_t<T_a, T_b>* = nullptr>
inline matrix_cl<return_type_t<T_a, T_b>> multiply(const T_a& a, const T_b& b) {
  matrix_cl<return_type_t<T_a, T_b>> scaled = a * b;
  return scaled;
}


}  // namespace math

}  // namespace stan

#endif

#endif

Eigen.hpp

stan::math::matrix_cl::cols
int cols() const
Definition matrix_cl.hpp:66

stan::math::matrix_cl::rows
int rows() const
Definition matrix_cl.hpp:64

stan::math::matrix_cl
Represents an arithmetic matrix on the OpenCL device.
Definition matrix_cl.hpp:47

stan::math::opencl_context
The API to access the methods and values in opencl_context_base.
Definition opencl_context.hpp:210

stan::math::check_opencl_error
void check_opencl_error(const char *function, const cl::Error &e)
Throws a domain error that specifies the OpenCL error that occurred.
Definition check_opencl.hpp:23

stan::math::opencl_context::device
std::vector< cl::Device > & device() noexcept
Returns a vector containing the OpenCL device used to create the context.
Definition opencl_context.hpp:393

stan::math::opencl_context::tuning_opts
opencl_context_base::tuning_struct & tuning_opts() noexcept
Returns a reference to the struct holding the OpenCL tuning options.
Definition opencl_context.hpp:386

stan::math::constant
auto constant(const T a, int rows, int cols)
Matrix of repeated values in kernel generator expressions.
Definition constant.hpp:130

stan::math::as_operation_cl
T_operation && as_operation_cl(T_operation &&a)
Converts any valid kernel generator expression into an operation.
Definition as_operation_cl.hpp:31

stan::require_all_kernel_expressions_and_none_scalar_t
require_all_t< is_kernel_expression_and_not_scalar< Types >... > require_all_kernel_expressions_and_none_scalar_t
Enables a template if all given types are non-scalar types that are a valid kernel generator expressi...
Definition is_kernel_expression.hpp:58

stan::math::opencl_kernels::row_vector_matrix_multiply
const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, matrix_cl_view, matrix_cl_view > row_vector_matrix_multiply("row_vector_matrix_multiply", {view_kernel_helpers, row_vector_matrix_multiply_kernel_code}, {{"LOCAL_SIZE_", 64}, {"REDUCTION_STEP_SIZE", 4}})
See the docs for row_vector_matrix_multiply() .

stan::math::opencl_kernels::add_batch
const kernel_cl< out_buffer, in_buffer, int, int, int > add_batch("add_batch", {indexing_helpers, add_batch_kernel_code})
See the docs for add_batch() .

stan::math::opencl_kernels::matrix_multiply
const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, int, matrix_cl_view, matrix_cl_view > matrix_multiply("matrix_multiply", {thread_block_helpers, view_kernel_helpers, matrix_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
See the docs for matrix_multiply() .

stan::math::either
const matrix_cl_view either(const matrix_cl_view left_view, const matrix_cl_view right_view)
Determines which parts are nonzero in any of the input views.
Definition matrix_cl_view.hpp:19

kernel_generator.hpp

matrix_cl.hpp

matrix_multiply.hpp

stan::math::operator*
fvar< T > operator*(const fvar< T > &x, const fvar< T > &y)
Return the product of the two arguments.
Definition operator_multiplication.hpp:20

stan::math::e
static constexpr double e()
Return the base of the natural logarithm.
Definition constants.hpp:20

stan::math::eval
T eval(T &&arg)
Inputs which have a plain_type equal to their own type are forwarded unmodified (for Eigen expressions ...
Definition eval.hpp:20

stan::math::multiply
auto multiply(const Mat1 &m1, const Mat2 &m2)
Return the product of the specified matrices.
Definition multiply.hpp:19

stan::math::matrix_vector_multiply
auto matrix_vector_multiply(T_matrix &&matrix, T_vector &&vector)
Multiplies a matrix and a vector on an OpenCL device.
Definition matrix_vector_multiply.hpp:27

stan::math::check_size_match
void check_size_match(const char *function, const char *name_i, T_size1 i, const char *name_j, T_size2 j)
Check if the provided sizes match.
Definition check_size_match.hpp:24

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

err.hpp

add.hpp

scalar_type.hpp

meta.hpp

stan::math::opencl_context_base::tuning_struct::multiply_wgs_per_compute_unit
int multiply_wgs_per_compute_unit
Definition opencl_context.hpp:184