math/tri__inverse_8hpp_source.html

#ifndef STAN_MATH_OPENCL_TRI_INVERSE_HPP

#define STAN_MATH_OPENCL_TRI_INVERSE_HPP


#ifdef STAN_OPENCL


#include <stan/math/opencl/plain_type.hpp>

#include <stan/math/opencl/matrix_cl.hpp>

#include <stan/math/opencl/matrix_cl_view.hpp>

#include <stan/math/opencl/kernels/diag_inv.hpp>

#include <stan/math/opencl/kernels/inv_lower_tri_multiply.hpp>

#include <stan/math/opencl/kernels/neg_rect_lower_tri_multiply.hpp>

#include <stan/math/opencl/err.hpp>

#include <stan/math/opencl/kernels/batch_identity.hpp>

#include <stan/math/opencl/zeros_strict_tri.hpp>

#include <stan/math/opencl/kernel_generator.hpp>

#include <stan/math/prim/meta.hpp>

#include <cmath>

#include <string>

#include <vector>


namespace stan {

namespace math {

template <matrix_cl_view matrix_view = matrix_cl_view::Entire, typename T,

          require_matrix_cl_st<std::is_floating_point, T>* = nullptr>

inline plain_type_t<T> tri_inverse(const T& A) {

  check_square("tri_inverse (OpenCL)", "A", A);

  // if the triangular view is not specified use the triangularity of

  // the input matrix

  matrix_cl_view tri_view = matrix_view;

  if (matrix_view == matrix_cl_view::Entire) {

    if (A.view() != matrix_cl_view::Diagonal) {

      check_triangular("tri_inverse (OpenCL)", "A", A);

    }

    tri_view = A.view();

  }

  if (tri_view == matrix_cl_view::Diagonal) {

    plain_type_t<T> inv_mat(A.rows(), A.cols());

    diagonal(inv_mat) = elt_divide(1.0, diagonal(A));

    return inv_mat;

  }


  int thread_block_2D_dim = 32;

  int max_1D_thread_block_size = opencl_context.max_thread_block_size();

  // we split the input matrix to 32 blocks

  int thread_block_size_1D

      = (((A.rows() / 32) + thread_block_2D_dim - 1) / thread_block_2D_dim)

        * thread_block_2D_dim;

  if (max_1D_thread_block_size < thread_block_size_1D) {

    thread_block_size_1D = max_1D_thread_block_size;

  }

  int max_2D_thread_block_dim = std::sqrt(max_1D_thread_block_size);

  if (max_2D_thread_block_dim < thread_block_2D_dim) {

    thread_block_2D_dim = max_2D_thread_block_dim;

  }

  // for small size split in max 2 parts

  if (thread_block_size_1D < 64) {

    thread_block_size_1D = 32;

  }

  if (A.rows() < thread_block_size_1D) {

    thread_block_size_1D = A.rows();

  }


  // pad the input matrix

  int A_rows_padded

      = ((A.rows() + thread_block_size_1D - 1) / thread_block_size_1D)

        * thread_block_size_1D;


  plain_type_t<T> temp(A_rows_padded, A_rows_padded);

  plain_type_t<T> inv_padded = constant(0.0, A_rows_padded, A_rows_padded);

  plain_type_t<T> inv_mat(A);

  plain_type_t<T> zero_mat

      = constant(0.0, A_rows_padded - A.rows(), A_rows_padded);

  if (tri_view == matrix_cl_view::Upper) {

    inv_mat = transpose(inv_mat).eval();

  }

  int work_per_thread

      = opencl_kernels::inv_lower_tri_multiply.get_option("WORK_PER_THREAD");

  // the number of blocks in the first step

  // each block is inverted with using the regular forward substitution

  int parts = inv_padded.rows() / thread_block_size_1D;

  block_zero_based(inv_padded, 0, 0, inv_mat.rows(), inv_mat.rows()) = inv_mat;

  try {

    // create a batch of identity matrices to be used in the first step

    opencl_kernels::batch_identity(

        cl::NDRange(parts, thread_block_size_1D, thread_block_size_1D), temp,

        thread_block_size_1D, temp.size());

    // spawn parts thread blocks, each responsible for one block

    opencl_kernels::diag_inv(cl::NDRange(parts * thread_block_size_1D),

                             cl::NDRange(thread_block_size_1D), inv_padded,

                             temp, inv_padded.rows());

  } catch (cl::Error& e) {

    check_opencl_error("inverse step1", e);

  }

  // set the padded part of the matrix and the upper triangular to zeros

  block_zero_based(inv_padded, inv_mat.rows(), 0, zero_mat.rows(),

                   zero_mat.cols())

      = zero_mat;

  inv_padded.template zeros_strict_tri<stan::math::matrix_cl_view::Upper>();

  if (parts == 1) {

    inv_mat

        = block_zero_based(inv_padded, 0, 0, inv_mat.rows(), inv_mat.rows());

    if (tri_view == matrix_cl_view::Upper) {

      inv_mat = transpose(inv_mat).eval();

    }

    return inv_mat;

  }

  using std::ceil;

  parts = ceil(parts / 2.0);


  auto result_matrix_dim = thread_block_size_1D;

  auto thread_block_work2d_dim = thread_block_2D_dim / work_per_thread;

  auto ndrange_2d

      = cl::NDRange(thread_block_2D_dim, thread_block_work2d_dim, 1);

  while (parts > 0) {

    int result_matrix_dim_x = result_matrix_dim;

    // when calculating the last submatrix

    // we can reduce the size to the actual size (not the next power of 2)

    if (parts == 1 && (inv_padded.rows() - result_matrix_dim * 2) < 0) {

      result_matrix_dim_x = inv_padded.rows() - result_matrix_dim;

    }

    auto result_work_dim = result_matrix_dim / work_per_thread;

    auto result_ndrange

        = cl::NDRange(result_matrix_dim_x, result_work_dim, parts);

    opencl_kernels::inv_lower_tri_multiply(result_ndrange, ndrange_2d,

                                           inv_padded, temp, inv_padded.rows(),

                                           result_matrix_dim);

    opencl_kernels::neg_rect_lower_tri_multiply(

        result_ndrange, ndrange_2d, inv_padded, temp, inv_padded.rows(),

        result_matrix_dim);

    // if this is the last submatrix, end

    if (parts == 1) {

      parts = 0;

    } else {

      parts = ceil(parts / 2.0);

    }

    result_matrix_dim *= 2;

    // set the padded part and upper diagonal to zeros

    block_zero_based(inv_padded, inv_mat.rows(), 0, zero_mat.rows(),

                     zero_mat.cols())

        = zero_mat;

    inv_padded.template zeros_strict_tri<stan::math::matrix_cl_view::Upper>();

  }

  // un-pad and return

  inv_mat = block_zero_based(inv_padded, 0, 0, inv_mat.rows(), inv_mat.rows());

  if (tri_view == matrix_cl_view::Upper) {

    inv_mat = transpose(inv_mat).eval();

  }

  inv_mat.view(tri_view);

  return inv_mat;

}

}  // namespace math

}  // namespace stan


#endif

#endif

batch_identity.hpp

stan::math::opencl_context
The API to access the methods and values in opencl_context_base.
Definition opencl_context.hpp:210

diag_inv.hpp

stan::math::check_triangular
void check_triangular(const char *function, const char *name, const T &A)
Check if the matrix_cl is either upper triangular or lower triangular.
Definition check_triangular.hpp:23

stan::math::check_opencl_error
void check_opencl_error(const char *function, const cl::Error &e)
Throws the domain error with specifying the OpenCL error that occurred.
Definition check_opencl.hpp:23

stan::math::opencl_context::max_thread_block_size
int max_thread_block_size() noexcept
Returns the maximum thread block size defined by CL_DEVICE_MAX_WORK_GROUP_SIZE for the device in the ...
Definition opencl_context.hpp:379

stan::math::block_zero_based
auto block_zero_based(T &&a, int start_row, int start_col, int rows, int cols)
Block of a kernel generator expression.
Definition block_zero_based.hpp:340

stan::math::transpose
auto transpose(Arg &&a)
Transposes a kernel generator expression.
Definition transpose.hpp:139

stan::math::elt_divide
elt_divide_< as_operation_cl_t< T_a >, as_operation_cl_t< T_b > > elt_divide(T_a &&a, T_b &&b)
Definition binary_operation.hpp:209

stan::math::constant
auto constant(const T a, int rows, int cols)
Matrix of repeated values in kernel generator expressions.
Definition constant.hpp:130

stan::math::diagonal
auto diagonal(T &&a)
Diagonal of a kernel generator expression.
Definition diagonal.hpp:136

stan::math::opencl_kernels::diag_inv
const kernel_cl< in_out_buffer, in_out_buffer, int > diag_inv("diag_inv", {indexing_helpers, diag_inv_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}})
See the docs for add() .

stan::math::opencl_kernels::batch_identity
const kernel_cl< out_buffer, int, int > batch_identity("batch_identity", {indexing_helpers, batch_identity_kernel_code})
See the docs for batch_identity() .

stan::math::opencl_kernels::inv_lower_tri_multiply
const kernel_cl< in_buffer, out_buffer, int, int > inv_lower_tri_multiply("inv_lower_tri_multiply", {thread_block_helpers, inv_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
See the docs for add() .

stan::math::opencl_kernels::neg_rect_lower_tri_multiply
const kernel_cl< in_out_buffer, in_buffer, int, int > neg_rect_lower_tri_multiply("neg_rect_lower_tri_multiply", {thread_block_helpers, neg_rect_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
See the docs for neg_rect_lower_tri_multiply() .

stan::math::tri_inverse
plain_type_t< T > tri_inverse(const T &A)
Computes the inverse of a triangular matrix.
Definition tri_inverse.hpp:40

inv_lower_tri_multiply.hpp

kernel_generator.hpp

matrix_cl.hpp

matrix_cl_view.hpp

stan::math::check_square
void check_square(const char *function, const char *name, const T_y &y)
Check if the specified matrix is square.
Definition check_square.hpp:23

stan::math::e
static constexpr double e()
Return the base of the natural logarithm.
Definition constants.hpp:20

stan::math::matrix_cl_view
matrix_cl_view
Definition matrix_cl_view.hpp:11

stan::math::matrix_cl_view::Upper
@ Upper

stan::math::matrix_cl_view::Entire
@ Entire

stan::math::matrix_cl_view::Diagonal
@ Diagonal

stan::math::ceil
fvar< T > ceil(const fvar< T > &x)
Definition ceil.hpp:13

stan::plain_type_t
typename plain_type< std::decay_t< T > >::type plain_type_t
Definition plain_type.hpp:23

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

neg_rect_lower_tri_multiply.hpp

err.hpp

plain_type.hpp

meta.hpp

zeros_strict_tri.hpp