Automatic Differentiation
 
Loading...
Searching...
No Matches
qr_decomposition.hpp
Go to the documentation of this file.
1#ifndef STAN_MATH_OPENCL_QR_DECOMPOSITION_HPP
2#define STAN_MATH_OPENCL_QR_DECOMPOSITION_HPP
3#ifdef STAN_OPENCL
4
13#include <CL/opencl.hpp>
14#include <algorithm>
15#include <vector>
16#include <cmath>
17
18namespace stan {
19namespace math {
20
33template <bool need_Q = true>
36 int r = 100) {
37 using std::copysign;
38 using std::sqrt;
39 int rows = A.rows();
40 int cols = A.cols();
41 if constexpr (need_Q) {
42 Q = identity_matrix<matrix_cl<double>>(rows);
43 }
44 R = A;
45 if (rows <= 1) {
46 return;
47 }
48
49 matrix_cl<double> V_alloc(rows, r);
50 matrix_cl<double> W_alloc(rows, r);
51 Eigen::VectorXd temporary(rows > cols ? rows : cols);
52
53 for (int k = 0; k < std::min(rows - 1, cols); k += r) {
54 int actual_r = std::min({r, rows - k - 1, cols - k});
55 matrix_cl<double> V(V_alloc.buffer(), rows - k, actual_r);
56 matrix_cl<double>& Y_cl = V;
57 matrix_cl<double> W_cl(W_alloc.buffer(), rows - k, actual_r);
58
59 Eigen::MatrixXd R_cpu_block
60 = from_matrix_cl(block_zero_based(R, k, k, rows - k, actual_r));
61
62 Eigen::MatrixXd V_cpu(rows - k, actual_r);
63 V_cpu.triangularView<Eigen::StrictlyUpper>()
64 = Eigen::MatrixXd::Constant(V.rows(), V.cols(), 0);
65
66 for (int j = 0; j < actual_r; j++) {
67 Eigen::VectorXd householder = R_cpu_block.block(j, j, rows - k - j, 1);
68 householder.coeffRef(0)
69 -= copysign(householder.norm(), householder.coeff(0));
70 if (householder.rows() != 1) {
71 householder *= SQRT_TWO / householder.norm();
72 }
73
74 V_cpu.col(j).tail(V.rows() - j) = householder;
75
76 Eigen::VectorXd::AlignedMapType temp_map(temporary.data(), actual_r - j);
77 temp_map.noalias()
78 = R_cpu_block.block(j, j, rows - k - j, actual_r - j).transpose()
79 * householder;
80 R_cpu_block.block(j, j + 1, rows - k - j, actual_r - j - 1).noalias()
81 -= householder * temp_map.tail(actual_r - j - 1).transpose();
82 R_cpu_block.coeffRef(j, j) -= householder.coeff(0) * temp_map.coeff(0);
83 }
84
85 matrix_cl<double> R_back_block = to_matrix_cl(R_cpu_block);
86 R_back_block.view(matrix_cl_view::Upper);
87 block_zero_based(R, k, k, rows - k, actual_r) = R_back_block;
88
89 Eigen::MatrixXd& Y_cpu = V_cpu;
90 Eigen::MatrixXd W_cpu = V_cpu;
91 for (int j = 1; j < actual_r; j++) {
92 Eigen::VectorXd::AlignedMapType temp_map(temporary.data(), j);
93 temp_map.noalias() = Y_cpu.leftCols(j).transpose() * V_cpu.col(j);
94 W_cpu.col(j).noalias() -= W_cpu.leftCols(j) * temp_map;
95 }
96 W_cl = to_matrix_cl(W_cpu);
97 V = to_matrix_cl(V_cpu);
98
99 auto R_block
100 = block_zero_based(R, k, k + actual_r, rows - k, cols - k - actual_r);
101 R_block -= Y_cl * (transpose(W_cl) * R_block);
102
103 if constexpr (need_Q) {
104 auto Q_block = block_zero_based(Q, 0, k, Q.rows(), Q.cols() - k);
105 Q_block -= (Q_block * W_cl) * transpose(Y_cl);
106 }
107 }
108
110}
111
112} // namespace math
113} // namespace stan
114#endif
115#endif // STAN_MATH_OPENCL_QR_DECOMPOSITION_HPP
const cl::Buffer & buffer() const
const matrix_cl_view & view() const
Definition matrix_cl.hpp:70
Represents an arithmetic matrix on the OpenCL device.
Definition matrix_cl.hpp:47
auto block_zero_based(T &&a, int start_row, int start_col, int rows, int cols)
Block of a kernel generator expression.
auto transpose(Arg &&a)
Transposes a kernel generator expression.
int64_t cols(const T_x &x)
Returns the number of columns in the specified kernel generator expression.
Definition cols.hpp:21
int64_t rows(const T_x &x)
Returns the number of rows in the specified kernel generator expression.
Definition rows.hpp:22
matrix_cl< scalar_type_t< T > > to_matrix_cl(T &&src)
Copies the source Eigen matrix, std::vector or scalar to the destination matrix that is stored on the...
Definition copy.hpp:45
auto from_matrix_cl(const T &src)
Copies the source matrix that is stored on the OpenCL device to the destination Eigen matrix.
Definition copy.hpp:61
static constexpr double SQRT_TWO
The value of the square root of 2, .
void qr_decomposition_cl(const matrix_cl< double > &A, matrix_cl< double > &Q, matrix_cl< double > &R, int r=100)
Calculates QR decomposition of A using the block Householder algorithm.
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...