math/opencl_2cholesky__decompose_8hpp_source.html

#ifndef STAN_MATH_OPENCL_CHOLESKY_DECOMPOSE_HPP

#define STAN_MATH_OPENCL_CHOLESKY_DECOMPOSE_HPP

#ifdef STAN_OPENCL


#include <stan/math/opencl/matrix_cl.hpp>

#include <stan/math/opencl/matrix_cl_view.hpp>

#include <stan/math/opencl/err.hpp>

#include <stan/math/opencl/multiply_transpose.hpp>

#include <stan/math/opencl/prim/multiply.hpp>

#include <stan/math/opencl/tri_inverse.hpp>

#include <stan/math/opencl/kernels/cholesky_decompose.hpp>

#include <stan/math/opencl/kernel_generator.hpp>

#include <stan/math/prim/meta.hpp>

#include <CL/opencl.hpp>

#include <algorithm>

#include <cmath>


namespace stan {

namespace math {

namespace opencl {

/**
 * Performs an in-place computation of the lower-triangular Cholesky factor
 * (i.e., matrix square root) of the matrix stored in `A` on the OpenCL
 * device.
 *
 * A matrix with zero rows is returned untouched. A matrix with at most
 * `tuning_opts().cholesky_min_L11_size` rows (or a single row) is factored by
 * one launch of the `cholesky_decompose` kernel whose global and local ranges
 * are both `A.rows()`. Larger matrices are split into blocks
 *
 *     A = [ A_11   .   ]
 *         [ A_21  A_22 ]
 *
 * and processed as
 *
 *     L_11 = chol(A_11)                      (recursive call)
 *     L_21 = A_21 * (L_11^-1)^T
 *     L_22 = chol(A_22 - L_21 * L_21^T)      (recursive call)
 *
 * with each result written back into the matching block of `A`. On return
 * the view of `A` is set to `matrix_cl_view::Lower`.
 *
 * NOTE(review): only `A.rows()` is consulted, so `A` is assumed to be square;
 * this function does not check it.
 *
 * @tparam T floating point scalar type of the matrix
 * @param[in,out] A on input the matrix to decompose, on output its Cholesky
 *   factor
 * @throw an OpenCL error raised by the kernel launch is handed to
 *   `check_opencl_error`, which reports it as a domain error
 */
template <typename T, typename = require_floating_point_t<T>>
inline void cholesky_decompose(matrix_cl<T>& A) {
  const int n = A.rows();
  if (n == 0) {
    return;
  }
  const auto& tuning = opencl_context.tuning_opts();
  // Repeats the blocked cholesky decomposition until the size of the remaining
  // submatrix is smaller or equal to the minimum blocks size
  // or a heuristic of 100.
  // The Cholesky (OpenCL) algorithm only uses one local block so we need the
  // matrix to be less than the max thread block size.
  // A 1x1 matrix cannot be partitioned any further, so it is always handed to
  // the kernel, whatever the tuning options say.
  if (n <= tuning.cholesky_min_L11_size || n == 1) {
    try {
      opencl_kernels::cholesky_decompose(cl::NDRange(n), cl::NDRange(n), A, n);
    } catch (const cl::Error& e) {
      check_opencl_error("cholesky_decompose", e);
    }
    A.view(matrix_cl_view::Lower);
    return;
  }
  // NOTE: The code in this section follows the naming conventions
  // in the report linked in the docs.

  // Integer division already truncates, so no std::floor is required.
  // A non-positive partition is treated as 1 to avoid dividing by zero.
  const int partition = std::max(tuning.cholesky_partition, 1);
  // Keep the split strictly inside the matrix (1 <= block <= n - 1; n >= 2
  // here). Otherwise A_11 or A_22 would be as large as A itself and the
  // recursion below would never terminate.
  const int block = std::min(std::max(n / partition, 1), n - 1);
  // Subset the top left block of the input A into A_11
  matrix_cl<T> A_11 = block_zero_based(A, 0, 0, block, block);
  // The following function either calls the
  // blocked cholesky recursively for the submatrix A_11
  // or calls the kernel directly if the size of the block is small enough
  opencl::cholesky_decompose(A_11);
  // Copies L_11 back to the input matrix
  block_zero_based(A, 0, 0, block, block)
      = block_zero_based(A_11, 0, 0, block, block);

  const int block_subset = n - block;
  matrix_cl<T> A_21 = block_zero_based(A, block, 0, block_subset, block);
  // computes A_21*((L_11^-1)^T)
  // and copies the resulting submatrix to the lower left hand corner of A
  matrix_cl<T> L_21 = A_21 * transpose(tri_inverse(A_11));
  block_zero_based(A, block, 0, block_subset, block)
      = block_zero_based(L_21, 0, 0, block_subset, block);
  matrix_cl<T> A_22
      = block_zero_based(A, block, block, block_subset, block_subset);
  // computes A_22 - L_21*(L_21^T)
  matrix_cl<T> L_22 = A_22 - multiply_transpose(L_21);
  // decomposes the updated trailing block in place
  opencl::cholesky_decompose(L_22);
  // copy L_22 into A's lower right hand corner
  block_zero_based(A, block, block, block_subset, block_subset)
      = block_zero_based(L_22, 0, 0, block_subset, block_subset);
  A.view(matrix_cl_view::Lower);
}

}  // namespace opencl

}  // namespace math

}  // namespace stan


#endif

#endif

stan::math::matrix_cl::rows
int rows() const
Definition matrix_cl.hpp:64

stan::math::matrix_cl::view
const matrix_cl_view & view() const
Definition matrix_cl.hpp:70

stan::math::matrix_cl
Represents an arithmetic matrix on the OpenCL device.
Definition matrix_cl.hpp:47

stan::math::opencl_context
The API to access the methods and values in opencl_context_base.
Definition opencl_context.hpp:210

stan::math::check_opencl_error
void check_opencl_error(const char *function, const cl::Error &e)
Throws a domain error that specifies the OpenCL error that occurred.
Definition check_opencl.hpp:23

stan::math::opencl_context::tuning_opts
opencl_context_base::tuning_struct & tuning_opts() noexcept
Returns the tuning options struct, which includes the thread block size for the Cholesky decomposition's L_11.
Definition opencl_context.hpp:386

stan::math::block_zero_based
auto block_zero_based(T &&a, int start_row, int start_col, int rows, int cols)
Block of a kernel generator expression.
Definition block_zero_based.hpp:340

stan::math::transpose
auto transpose(Arg &&a)
Transposes a kernel generator expression.
Definition transpose.hpp:139

stan::math::opencl_kernels::cholesky_decompose
const kernel_cl< in_out_buffer, int > cholesky_decompose("cholesky_decompose", {indexing_helpers, cholesky_decompose_kernel_code})
See the docs for cholesky_decompose() .

stan::math::tri_inverse
plain_type_t< T > tri_inverse(const T &A)
Computes the inverse of a triangular matrix.
Definition tri_inverse.hpp:40

stan::math::opencl::cholesky_decompose
void cholesky_decompose(matrix_cl< T > &A)
Performs an in-place computation of the lower-triangular Cholesky factor (i.e., matrix square root) o...
Definition cholesky_decompose.hpp:43

stan::math::multiply_transpose
matrix_cl< T > multiply_transpose(const matrix_cl< T > &A)
Computes the product of a square OpenCL matrix with its transpose.
Definition multiply_transpose.hpp:24

kernel_generator.hpp

matrix_cl.hpp

matrix_cl_view.hpp

multiply_transpose.hpp

stan::math::e
static constexpr double e()
Return the base of the natural logarithm.
Definition constants.hpp:20

stan::math::block
auto block(T_x &&x, size_t i, size_t j, size_t nrows, size_t ncols)
Return a nrows x ncols submatrix starting at (i-1, j-1).
Definition block.hpp:24

stan::math::matrix_cl_view::Lower
@ Lower

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

err.hpp

cholesky_decompose.hpp

multiply.hpp

meta.hpp

stan::math::opencl_context_base::tuning_struct::cholesky_partition
int cholesky_partition
Definition opencl_context.hpp:178

stan::math::opencl_context_base::tuning_struct::cholesky_min_L11_size
int cholesky_min_L11_size
Definition opencl_context.hpp:177

tri_inverse.hpp