Namespaces
namespace	internal

Classes
struct	cumulative_sum
	struct containing cumulative_sum kernels, grouped by scalar type. More...

struct	cumulative_sum< double, T >

struct	cumulative_sum< int, T >

struct	in_buffer
	An in_buffer signifies a cl::Buffer argument used as input. More...

struct	in_out_buffer
	An in_out_buffer signifies a cl::Buffer argument used as both input and output. More...

struct	kernel_cl
	Creates functor for kernels. More...

struct	out_buffer
	An out_buffer signifies a cl::Buffer argument used as output. More...

struct	sort_asc
	struct containing sort_asc kernels, grouped by scalar type. More...

struct	sort_asc< double, T >

struct	sort_asc< int, T >

struct	sort_desc
	struct containing sort_desc kernels, grouped by scalar type. More...

struct	sort_desc< double, T >

struct	sort_desc< int, T >

Functions
auto	compile_kernel (const char *name, const std::vector< std::string > &sources, const std::unordered_map< std::string, int > &options)
	Compile an OpenCL kernel.

__kernel void	add_batch (__global double *B, __global double *A, unsigned int rows, unsigned int cols, unsigned int batch_size)
	Sums a batch of matrices.

__kernel void	batch_identity (__global double *A, unsigned int batch_rows, unsigned int size)
	Makes a batch of smaller identity matrices inside the input matrix.

__kernel void	categorical_logit_glm (__global double *logp_global, __global double *exp_lin_global, __global double *inv_sum_exp_lin_global, __global double *neg_softmax_lin_global, __global double *alpha_derivative, const __global int *y_global, const __global double *x_beta_global, const __global double *alpha_global, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector, const int need_alpha_derivative, const int need_neg_softmax_lin_global)
	GPU implementation of Generalized Linear Model (GLM) with categorical distribution and logit (softmax) link function.

__kernel void	categorical_logit_glm_beta_derivative (__global double *beta_derivative, __global double *temp, const __global int *y, const __global double *x, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector)
	Calculates derivative wrt beta.

__kernel void	is_symmetric (__global double *A, __global int *flag, unsigned int rows, unsigned int cols, double tolerance)
	Check if the `matrix_cl` is symmetric.

__kernel void	cholesky_decompose (__global double *A, int rows)
	Calculates the Cholesky Decomposition of a matrix on an OpenCL device.

__kernel void	cumulative_sum1 (__global SCAL *out_wgs, __global SCAL *out_threads, __global SCAL *in, int size)
	First kernel of the cumulative sum implementation.

__kernel void	cumulative_sum2 (__global SCAL *data, int size)
	Second kernel of the cumulative sum implementation.

__kernel void	cumulative_sum3 (__global SCAL *out, __global SCAL *in_data, __global SCAL *in_threads, __global SCAL *in_wgs, int size)
	Third kernel of the cumulative sum implementation.

void	atomic_add_double (__global double *val, double delta)
	Atomically add to a double value.

void	local_atomic_add_double (__local double *val, double delta)
	Atomically add to a local double value.

double	beta (double a, double b)
	Return the beta function applied to the specified arguments.

double	binomial_coefficient_log (double n, double k)
	Return the log of the binomial coefficient for the specified arguments.

double	digamma (double x)
	Calculates the digamma function - derivative of logarithm of gamma.

double	inv_logit (double x)
	Returns the inverse logit function applied to the kernel generator expression.

double	inv_Phi (double p)
	Return the inv_Phi function applied to the specified argument.

double	inv_square (double x)
	Calculates `1 / (x*x)`

double	lbeta (double a, double b)
	Return the log of the beta function applied to the specified arguments.

double	lgamma_stirling (double x)
	Return the Stirling approximation to the lgamma function.

double	lgamma_stirling_diff (double x)
	Return the difference between log of the gamma function and its Stirling approximation.

double	lmultiply (double a, double b)
	Return the first argument times the natural log of the second argument if either argument is non-zero and 0 if both arguments are 0.

double	log1m (double a)
	Calculates the natural logarithm of one minus the specified value.

double	log1m_exp (double a)
	Calculates the natural logarithm of one minus the exponential of the specified value without overflow.

double	log1m_inv_logit (double x)
	Return the natural logarithm of 1 minus the inverse logit applied to the kernel generator expression.

double	log1p_exp (double a)
	Calculates the log of 1 plus the exponential of the specified value without overflow.

double	log_diff_exp (double x, double y)
	The natural logarithm of the difference of the natural exponentiation of x and the natural exponentiation of y.

double	log_inv_logit (double x)
	Return the natural logarithm of the inverse logit of the specified argument.

double	log_inv_logit_diff (double x, double y)
	Returns the natural logarithm of the difference of the inverse logits of the specified arguments.

double	logit (double x)
	Return the log odds applied to the kernel generator expression.

double	multiply_log (double a, double b)
	Calculate the value of the first argument times log of the second argument while behaving properly with 0 inputs.

double	Phi (double x)
	Return the Phi function applied to the specified argument.

double	Phi_approx (double x)
	Return the Phi_approx function applied to the specified argument.

double	trigamma (double x)
	Return the trigamma function applied to the argument.

__kernel void	diag_inv (__global double *A, __global double *tmp_inv, int rows)
	Calculates inplace submatrix inversions along the matrix diagonal.

__kernel void	divide_columns_vec (__global double *A, __global double *vec, int vec_size)
	Takes A and divides each column vector in A element-wise by the values in vec.

__kernel void	fill_strict_tri (__global double *A, double val, unsigned int rows, unsigned int cols, unsigned int view_A)
	Stores constant in the triangular part of a matrix on the OpenCL device.

__kernel void	gp_exp_quad_cov (const __global double *x, __global double *res, const double sigma_sq, const double neg_half_inv_l_sq, const int size, const int element_size)
	GPU part of calculation of squared exponential kernel.

__kernel void	gp_exp_quad_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double neg_half_inv_l_sq, const int size1, const int size2, const int element_size)
	GPU part of calculation of squared exponential kernel.

__kernel void	gp_exponential_cov (const __global double *x, __global double *res, const double sigma_sq, const double neg_inv_l, const int size, const int element_size)
	GPU part of calculation of Matern exponential kernel.

__kernel void	gp_exponential_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double neg_inv_l, const int size1, const int size2, const int element_size)
	GPU part of calculation of Matern exponential kernel.

__kernel void	gp_matern32_cov (const __global double *x, __global double *res, const double sigma_sq, const double root_3_inv_l, const int size, const int element_size)
	GPU part of calculation of Matern 3/2 kernel.

__kernel void	gp_matern32_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double root_3_inv_l, const int size1, const int size2, const int element_size)
	GPU part of calculation of Matern 3/2 kernel.

__kernel void	gp_matern52_cov (const __global double *x, __global double *res, const double sigma_sq, const double root_5_inv_l, const double inv_l_sq_5_3, const int size, const int element_size)
	GPU part of calculation of Matern 5/2 kernel.

__kernel void	gp_matern52_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double root_5_inv_l, const double inv_l_sq_5_3, const int size1, const int size2, const int element_size)
	GPU part of calculation of Matern 5/2 kernel.

__kernel void	indexing_rev (__global double *adj, const __global int *index, const __global double *res, int size)
	Increments adjoint of the indexing operation argument given the indices and adjoints of the indexing result.

__kernel void	indexing_rev (__global double *adj, const __global int *index, const __global double *res, __local double *adj_loc, int index_size, int adj_size)
	Increments adjoint of the indexing operation argument given the indices and adjoints of the indexing result.

__kernel void	inv_lower_tri_multiply (__global double *A, __global double *temp, const int A_rows, const int rows)
	Calculates B = C * A.

__kernel void	matrix_multiply (const __global double *A, const __global double *B, __global double *C, const int M, const int N, const int K, unsigned int view_A, unsigned int view_B)
	Matrix multiplication on the OpenCL device.

__kernel void	row_vector_matrix_multiply (const __global double *A, const __global double *B, __global double *R, const int N, const int K, unsigned int view_A, unsigned int view_B)
	Row vector-matrix multiplication R=A*B on the OpenCL device.

void	merge (__global SCAL *A, __global SCAL *B, __global SCAL *res, int A_size, int B_size)
	Merges two sorted runs into a single sorted run of combined length.

int	binary_search (__global SCAL *input, int start, int end, SCAL value)
	Searches for the index of the element that is larger than or equal to given `value` in given range.

__kernel void	merge_step (__global SCAL *output, __global SCAL *input, int run_len, int size, int tasks)
	Merges sorted runs into longer sorted runs.

int	get_sturm_count_tri (const __global double *diagonal, const __global double *subdiagonal_squared, const double shift, const int n)
	Calculates lower Sturm count of a tridiagonal matrix T - number of eigenvalues lower than shift.

void	eigenvals_bisect (const __global double *diagonal, const __global double *subdiagonal_squared, double *low_res, double *high_res, const double min_eigval, const double max_eigval, const int n, const int i)
	Calculates i-th largest eigenvalue of tridiagonal matrix represented by a LDL decomposition using bisection.

int	get_sturm_count_ldl (const __global double_d *l, const __global double_d *d, const double_d shift, const int n)
	Calculates Sturm count of a LDL decomposition of a tridiagonal matrix - number of eigenvalues larger or equal to shift.

void	eigenvals_bisect_refine (const __global double_d *l, const __global double_d *d, double_d *low_res, double_d *high_res, const int n, const int i)
	Refines bounds on the i-th largest eigenvalue of a LDL decomposition using bisection.

__kernel void	eigenvals (const __global double *diagonal, const __global double *subdiagonal_squared, const __global double_d *l, const __global double_d *d, __global double *eigval_global, __global double_d *shifted_low_global, __global double_d *shifted_high_global, const double min_eigval, const double max_eigval, const double shift, const char do_refine)
	Calculates eigenvalues of a tridiagonal matrix T and refines shifted eigenvalues using shifted LDL decomposition of T.

int	get_twisted_factorization (const __global double_d *l, const __global double_d *d, double_d shift, __global double_d *l_plus, __global double_d *u_minus, __global double_d *s)
	Calculates shifted LDL and UDU factorizations.

void	calculate_eigenvector (const __global double_d *l_plus, const __global double_d *u_minus, const __global double *subdiag, int twist_idx, __global double *eigenvectors)
	Calculates an eigenvector from twisted factorization T - shift * I = L+.

__kernel void	get_eigenvectors (const __global double_d *l, const __global double_d *d, const __global double *subdiag, const __global double_d *shifted_eigvals, __global double_d *l_plus, __global double_d *u_minus, __global double_d *temp, __global double *eigenvectors)
	Calculates eigenvectors for (shifted) eigenvalues.

__kernel void	multiply_transpose (const __global double *A, __global double *B, const int M, const int N)
	Matrix multiplication of the form A*A^T on the OpenCL device.

__kernel void	neg_binomial_2_log_glm (__global double *logp_global, __global double *theta_derivative_global, __global double *theta_derivative_sum, __global double *phi_derivative_global, const __global int *y_global, const __global double *x, const __global double *alpha, const __global double *beta, const __global double *phi_global, const int N, const int M, const int is_y_vector, const int is_alpha_vector, const int is_phi_vector, const int need_theta_derivative, const int need_theta_derivative_sum, const int need_phi_derivative, const int need_phi_derivative_sum, const int need_logp1, const int need_logp2, const int need_logp3, const int need_logp4)
	GPU implementation of Generalized Linear Model (GLM) with Negative-Binomial-2 distribution and log link function.

__kernel void	neg_rect_lower_tri_multiply (__global double *A, const __global double *temp, const int A_rows, const int rows)
	Calculates C = -B * A where B is rectangular and A is a lower triangular.

__kernel void	ordered_logistic_glm (__global double *location_sum, __global double *logp_global, __global double *location_derivative, __global double *cuts_derivative, const __global int *y_global, const __global double *x, const __global double *beta, const __global double *cuts, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector, const int need_location_derivative, const int need_cuts_derivative)
	GPU implementation of ordinal regression Generalized Linear Model (GLM).

__kernel void	ordered_logistic (__global double *logp_global, __global double *lambda_derivative, __global double *cuts_derivative, const __global int *y_global, const __global double *lambda_global, const __global double *cuts, const int N_instances, const int N_classes, const int is_y_vector, const int is_cuts_matrix, const int need_lambda_derivative, const int need_cuts_derivative)
	GPU implementation of ordinal regression.

__kernel void	pack (__global double *A, __global double *B, unsigned int rows, unsigned int cols, unsigned int view)
	Packs a flat matrix to a packed triangular matrix.

__kernel void	rep_matrix_rev (__global double *A_adj, __global double *B_adj, unsigned int B_rows, unsigned int B_cols, unsigned int view_B)
	Implements reverse pass of rep_matrix.

__kernel void	tridiagonalization_householder (__global double *P, __global double *V, __global double *q_glob, const int P_rows, const int V_rows, const int j, const int k)
	Calculates householder vector and first element of the vector v.

__kernel void	tridiagonalization_v_step_1 (const __global double *P, const __global double *V, __global double *Uu, __global double *Vu, const int P_rows, const int V_rows, const int k)
	Calculates first part of constructing the vector v: Uu = Pb * u and Vu = Vl * u.

__kernel void	tridiagonalization_v_step_2 (const __global double *P, __global double *V, const __global double *Uu, const __global double *Vu, const int P_rows, const int V_rows, const int k, const int j)
	Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu.

__kernel void	tridiagonalization_v_step_3 (__global double *P, __global double *V, __global double *q, const int P_rows, const int V_rows, const int k, const int j)
	Third part in constructing vector v: v-=0.5(v^Tu)*u, where u is the householder vector.

__kernel void	unpack (__global double *B, __global double *A, unsigned int rows, unsigned int cols, unsigned int view)
	Unpacks a packed triangular matrix to a flat matrix.

Variables
const kernel_cl< out_buffer, in_buffer, int, int, int >	add_batch ("add_batch", {indexing_helpers, add_batch_kernel_code})
	See the docs for add_batch() .

const kernel_cl< out_buffer, int, int >	batch_identity ("batch_identity", {indexing_helpers, batch_identity_kernel_code})
	See the docs for batch_identity() .

const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int >	categorical_logit_glm ("categorical_logit_glm", {categorical_logit_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
	See the docs for categorical_logit_glm() .

const kernel_cl< in_out_buffer, in_out_buffer, in_buffer, in_buffer, int, int, int, int >	categorical_logit_glm_beta_derivative ("categorical_logit_glm_beta_derivative", {categorical_logit_glm_beta_derivative_kernel_code})
	See the docs for categorical_logit_glm_beta_derivative() .

const kernel_cl< in_buffer, out_buffer, int, int, const double >	check_symmetric ("is_symmetric", {indexing_helpers, is_symmetric_kernel_code})
	See the docs for is_symmetric() .

const kernel_cl< in_out_buffer, int >	cholesky_decompose ("cholesky_decompose", {indexing_helpers, cholesky_decompose_kernel_code})
	See the docs for cholesky_decompose() .

const kernel_cl< in_out_buffer, in_out_buffer, int >	diag_inv ("diag_inv", {indexing_helpers, diag_inv_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}})
	See the docs for diag_inv() .

const kernel_cl< out_buffer, in_buffer, int >	divide_columns_vec ("divide_columns_vec", {indexing_helpers, divide_columns_kernel_code})
	See the docs for divide_columns_vec() .

const kernel_cl< out_buffer, double, int, int, matrix_cl_view >	fill_strict_tri ("fill_strict_tri", {indexing_helpers, view_kernel_helpers, fill_strict_tri_kernel_code})
	See the docs for fill_strict_tri() .

const kernel_cl< in_buffer, out_buffer, double, double, int, int >	gp_exp_quad_cov ("gp_exp_quad_cov", {gp_exp_quad_cov_kernel_code})
	See the docs for gp_exp_quad_cov() .

const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int >	gp_exp_quad_cov_cross ("gp_exp_quad_cov_cross", {gp_exp_quad_cov_cross_kernel_code})
	See the docs for gp_exp_quad_cov_cross() .

const kernel_cl< in_buffer, out_buffer, double, double, int, int >	gp_exponential_cov ("gp_exponential_cov", {gp_exponential_cov_kernel_code})
	See the docs for gp_exponential_cov() .

const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int >	gp_exponential_cov_cross ("gp_exponential_cov_cross", {gp_exponential_cov_cross_kernel_code})
	See the docs for gp_exponential_cov_cross() .

const kernel_cl< in_buffer, out_buffer, double, double, int, int >	gp_matern32_cov ("gp_matern32_cov", {gp_matern32_cov_kernel_code})
	See the docs for gp_matern32_cov() .

const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int >	gp_matern32_cov_cross ("gp_matern32_cov_cross", {gp_matern32_cov_cross_kernel_code})
	See the docs for gp_matern32_cov_cross() .

const kernel_cl< in_buffer, out_buffer, double, double, double, int, int >	gp_matern52_cov ("gp_matern52_cov", {gp_matern52_cov_kernel_code})
	See the docs for gp_matern52_cov() .

const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, double, int, int, int >	gp_matern52_cov_cross ("gp_matern52_cov_cross", {gp_matern52_cov_cross_kernel_code})
	See the docs for gp_matern52_cov_cross() .

static const std::string	indexing_helpers
	Defines helper macros for common matrix indexing operations.

static const std::string	thread_block_helpers
	Defines a helper macro for kernels with 2D local size.

const kernel_cl< in_out_buffer, in_buffer, in_buffer, int >	indexing_rev_global_atomic ("indexing_rev", {atomic_add_double_device_function, indexing_rev_global_atomic_kernel_code})
	See the docs for indexing_rev() .

const kernel_cl< in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int, int >	indexing_rev_local_atomic ("indexing_rev", {atomic_add_double_device_function, indexing_rev_local_atomic_kernel_code})
	See the docs for indexing_rev() .

const kernel_cl< in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int, int >	indexing_rev_local_independent ("indexing_rev", {atomic_add_double_device_function, indexing_rev_local_independent_kernel_code})
	See the docs for indexing_rev() .

const kernel_cl< in_buffer, out_buffer, int, int >	inv_lower_tri_multiply ("inv_lower_tri_multiply", {thread_block_helpers, inv_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
	See the docs for inv_lower_tri_multiply() .

const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, int, matrix_cl_view, matrix_cl_view >	matrix_multiply ("matrix_multiply", {thread_block_helpers, view_kernel_helpers, matrix_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
	See the docs for matrix_multiply() .

const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, matrix_cl_view, matrix_cl_view >	row_vector_matrix_multiply ("row_vector_matrix_multiply", {view_kernel_helpers, row_vector_matrix_multiply_kernel_code}, {{"LOCAL_SIZE_", 64}, {"REDUCTION_STEP_SIZE", 4}})
	See the docs for row_vector_matrix_multiply() .

const kernel_cl< in_buffer, in_buffer, in_buffer, in_buffer, out_buffer, out_buffer, out_buffer, double, double, double, char >	eigenvals ("eigenvals", {stan::math::internal::double_d_src, eigenvals_bisect_kernel_code})

const kernel_cl< in_buffer, in_buffer, in_buffer, in_buffer, in_out_buffer, in_out_buffer, in_out_buffer, out_buffer >	get_eigenvectors ("get_eigenvectors", {stan::math::internal::double_d_src, get_eigenvectors_kernel_code})

const kernel_cl< in_buffer, out_buffer, int, int >	multiply_transpose ("multiply_transpose", {thread_block_helpers, multiply_transpose_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 4}})
	See the docs for multiply_transpose() .

const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int, int, int, int, int, int, int, int >	neg_binomial_2_log_glm ("neg_binomial_2_log_glm", {digamma_device_function, log1p_exp_device_function, neg_binomial_2_log_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
	See the docs for neg_binomial_2_log_glm_lpmf() .

const kernel_cl< in_out_buffer, in_buffer, int, int >	neg_rect_lower_tri_multiply ("neg_rect_lower_tri_multiply", {thread_block_helpers, neg_rect_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
	See the docs for neg_rect_lower_tri_multiply() .

const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int >	ordered_logistic_glm ("ordered_logistic_glm", {log1p_exp_device_function, log1m_exp_device_function, inv_logit_device_function, ordered_logistic_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
	See the docs for ordered_logistic_glm() .

const kernel_cl< out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int >	ordered_logistic ("ordered_logistic", {log1p_exp_device_function, log1m_exp_device_function, inv_logit_device_function, ordered_logistic_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
	See the docs for ordered_logistic() .

const kernel_cl< out_buffer, in_buffer, int, int, matrix_cl_view >	pack ("pack", {indexing_helpers, pack_kernel_code})
	See the docs for pack() .

const kernel_cl< in_out_buffer, in_buffer, int, int, matrix_cl_view >	rep_matrix_rev ("rep_matrix_rev", {view_kernel_helpers, rep_matrix_rev_kernel_code})
	See the docs for rep_matrix_rev() .

const kernel_cl< in_out_buffer, in_out_buffer, out_buffer, int, int, int, int >	tridiagonalization_householder ("tridiagonalization_householder", {tridiagonalization_householder_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}})

const kernel_cl< in_buffer, in_buffer, out_buffer, out_buffer, int, int, int >	tridiagonalization_v_step_1 ("tridiagonalization_v_step_1", {tridiagonalization_v_step_1_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})

const kernel_cl< in_buffer, out_buffer, in_buffer, in_buffer, int, int, int, int >	tridiagonalization_v_step_2 ("tridiagonalization_v_step_2", {tridiagonalization_v_step_2_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})

const kernel_cl< in_out_buffer, in_out_buffer, out_buffer, int, int, int, int >	tridiagonalization_v_step_3 ("tridiagonalization_v_step_3", {tridiagonalization_v_step_3_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}})

const kernel_cl< out_buffer, in_buffer, int, int, matrix_cl_view >	unpack ("unpack", {indexing_helpers, unpack_kernel_code})
	See the docs for unpack() .

Table of contents

Namespaces

Classes

Functions

Variables