Automatic Differentiation
 
Loading...
Searching...
No Matches
stan::math::opencl_kernels Namespace Reference

Namespaces

namespace  internal
 

Classes

struct  cumulative_sum
 struct containing cumulative_sum kernels, grouped by scalar type. More...
 
struct  cumulative_sum< double, T >
 
struct  cumulative_sum< int, T >
 
struct  in_buffer
 An in_buffer signifies a cl::Buffer argument used as input. More...
 
struct  in_out_buffer
 An in_out_buffer signifies a cl::Buffer argument used as both input and output. More...
 
struct  kernel_cl
 Creates functor for kernels. More...
 
struct  out_buffer
 An out_buffer signifies a cl::Buffer argument used as output. More...
 
struct  sort_asc
 struct containing sort_asc kernels, grouped by scalar type. More...
 
struct  sort_asc< double, T >
 
struct  sort_asc< int, T >
 
struct  sort_desc
 struct containing sort_desc kernels, grouped by scalar type. More...
 
struct  sort_desc< double, T >
 
struct  sort_desc< int, T >
 

Functions

auto compile_kernel (const char *name, const std::vector< std::string > &sources, const std::unordered_map< std::string, int > &options)
 Compile an OpenCL kernel.
 
__kernel void add_batch (__global double *B, __global double *A, unsigned int rows, unsigned int cols, unsigned int batch_size)
 Sums a batch of matrices.
 
__kernel void batch_identity (__global double *A, unsigned int batch_rows, unsigned int size)
 Makes a batch of smaller identity matrices inside the input matrix.
 
__kernel void categorical_logit_glm (__global double *logp_global, __global double *exp_lin_global, __global double *inv_sum_exp_lin_global, __global double *neg_softmax_lin_global, __global double *alpha_derivative, const __global int *y_global, const __global double *x_beta_global, const __global double *alpha_global, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector, const int need_alpha_derivative, const int need_neg_softmax_lin_global)
 GPU implementation of Generalized Linear Model (GLM) with categorical distribution and logit (softmax) link function.
 
__kernel void categorical_logit_glm_beta_derivative (__global double *beta_derivative, __global double *temp, const __global int *y, const __global double *x, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector)
 Calculates derivative wrt beta.
 
__kernel void is_symmetric (__global double *A, __global int *flag, unsigned int rows, unsigned int cols, double tolerance)
 Check if the matrix_cl is symmetric.
 
__kernel void cholesky_decompose (__global double *A, int rows)
 Calculates the Cholesky Decomposition of a matrix on an OpenCL device.
 
__kernel void cumulative_sum1 (__global SCAL *out_wgs, __global SCAL *out_threads, __global SCAL *in, int size)
 First kernel of the cumulative sum implementation.
 
__kernel void cumulative_sum2 (__global SCAL *data, int size)
 Second kernel of the cumulative sum implementation.
 
__kernel void cumulative_sum3 (__global SCAL *out, __global SCAL *in_data, __global SCAL *in_threads, __global SCAL *in_wgs, int size)
 Third kernel of the cumulative sum implementation.
 
void atomic_add_double (__global double *val, double delta)
 Atomically add to a double value.
 
void local_atomic_add_double (__local double *val, double delta)
 Atomically add to a local double value.
 
double beta (double a, double b)
 Return the beta function applied to the specified arguments.
 
double binomial_coefficient_log (double n, double k)
 Return the log of the binomial coefficient for the specified arguments.
 
double digamma (double x)
 Calculates the digamma function - derivative of logarithm of gamma.
 
double inv_logit (double x)
 Returns the inverse logit function applied to the kernel generator expression.
 
double inv_Phi (double p)
 Return the inv_Phi function applied to the specified argument.
 
double inv_square (double x)
 Calculates 1 / (x*x)
 
double lbeta (double a, double b)
 Return the log of the beta function applied to the specified arguments.
 
double lgamma_stirling (double x)
 Return the Stirling approximation to the lgamma function.
 
double lgamma_stirling_diff (double x)
 Return the difference between log of the gamma function and its Stirling approximation.
 
double lmultiply (double a, double b)
 Return the first argument times the natural log of the second argument if either argument is non-zero and 0 if both arguments are 0.
 
double log1m (double a)
 Calculates the natural logarithm of one minus the specified value.
 
double log1m_exp (double a)
 Calculates the natural logarithm of one minus the exponential of the specified value without overflow.
 
double log1m_inv_logit (double x)
 Return the natural logarithm of 1 minus the inverse logit applied to the kernel generator expression.
 
double log1p_exp (double a)
 Calculates the log of 1 plus the exponential of the specified value without overflow.
 
double log_diff_exp (double x, double y)
 The natural logarithm of the difference of the natural exponentiation of x and the natural exponentiation of y.
 
double log_inv_logit (double x)
 Return the natural logarithm of the inverse logit of the specified argument.
 
double log_inv_logit_diff (double x, double y)
 Returns the natural logarithm of the difference of the inverse logits of the specified arguments.
 
double logit (double x)
 Return the log odds applied to the kernel generator expression.
 
double multiply_log (double a, double b)
 Calculate the value of the first argument times log of the second argument while behaving properly with 0 inputs.
 
double Phi (double x)
 Return the Phi function applied to the specified argument.
 
double Phi_approx (double x)
 Return the Phi_approx function applied to the specified argument.
 
double trigamma (double x)
 Return the trigamma function applied to the argument.
 
__kernel void diag_inv (__global double *A, __global double *tmp_inv, int rows)
 Calculates inplace submatrix inversions along the matrix diagonal.
 
__kernel void divide_columns_vec (__global double *A, __global double *vec, int vec_size)
 Takes matrix A and divides each of its columns element-wise by the values in vec.
 
__kernel void fill_strict_tri (__global double *A, double val, unsigned int rows, unsigned int cols, unsigned int view_A)
 Stores constant in the triangular part of a matrix on the OpenCL device.
 
__kernel void gp_exp_quad_cov (const __global double *x, __global double *res, const double sigma_sq, const double neg_half_inv_l_sq, const int size, const int element_size)
 GPU part of calculation of squared exponential kernel.
 
__kernel void gp_exp_quad_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double neg_half_inv_l_sq, const int size1, const int size2, const int element_size)
 GPU part of calculation of squared exponential kernel.
 
__kernel void gp_exponential_cov (const __global double *x, __global double *res, const double sigma_sq, const double neg_inv_l, const int size, const int element_size)
 GPU part of calculation of Matern exponential kernel.
 
__kernel void gp_exponential_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double neg_inv_l, const int size1, const int size2, const int element_size)
 GPU part of calculation of Matern exponential kernel.
 
__kernel void gp_matern32_cov (const __global double *x, __global double *res, const double sigma_sq, const double root_3_inv_l, const int size, const int element_size)
 GPU part of calculation of Matern 3/2 kernel.
 
__kernel void gp_matern32_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double root_3_inv_l, const int size1, const int size2, const int element_size)
 GPU part of calculation of Matern 3/2 kernel.
 
__kernel void gp_matern52_cov (const __global double *x, __global double *res, const double sigma_sq, const double root_5_inv_l, const double inv_l_sq_5_3, const int size, const int element_size)
 GPU part of calculation of Matern 5/2 kernel.
 
__kernel void gp_matern52_cov_cross (const __global double *x1, const __global double *x2, __global double *res, const double sigma_sq, const double root_5_inv_l, const double inv_l_sq_5_3, const int size1, const int size2, const int element_size)
 GPU part of calculation of Matern 5/2 kernel.
 
__kernel void indexing_rev (__global double *adj, const __global int *index, const __global double *res, int size)
 Increments adjoint of the indexing operation argument given the indices and adjoints of the indexing result.
 
__kernel void indexing_rev (__global double *adj, const __global int *index, const __global double *res, __local double *adj_loc, int index_size, int adj_size)
 Increments adjoint of the indexing operation argument given the indices and adjoints of the indexing result.
 
__kernel void inv_lower_tri_multiply (__global double *A, __global double *temp, const int A_rows, const int rows)
 Calculates B = C * A.
 
__kernel void matrix_multiply (const __global double *A, const __global double *B, __global double *C, const int M, const int N, const int K, unsigned int view_A, unsigned int view_B)
 Matrix multiplication on the OpenCL device.
 
__kernel void row_vector_matrix_multiply (const __global double *A, const __global double *B, __global double *R, const int N, const int K, unsigned int view_A, unsigned int view_B)
 Row vector-matrix multiplication R=A*B on the OpenCL device.
 
void merge (__global SCAL *A, __global SCAL *B, __global SCAL *res, int A_size, int B_size)
 Merges two sorted runs into a single sorted run of combined length.
 
int binary_search (__global SCAL *input, int start, int end, SCAL value)
 Searches for the index of the element that is larger than or equal to the given value in the given range.
 
__kernel void merge_step (__global SCAL *output, __global SCAL *input, int run_len, int size, int tasks)
 Merges sorted runs into longer sorted runs.
 
int get_sturm_count_tri (const __global double *diagonal, const __global double *subdiagonal_squared, const double shift, const int n)
 Calculates lower Sturm count of a tridiagonal matrix T - number of eigenvalues lower than shift.
 
void eigenvals_bisect (const __global double *diagonal, const __global double *subdiagonal_squared, double *low_res, double *high_res, const double min_eigval, const double max_eigval, const int n, const int i)
 Calculates the i-th largest eigenvalue of a tridiagonal matrix represented by an LDL decomposition using bisection.
 
int get_sturm_count_ldl (const __global double_d *l, const __global double_d *d, const double_d shift, const int n)
 Calculates the Sturm count of an LDL decomposition of a tridiagonal matrix - number of eigenvalues larger or equal to shift.
 
void eigenvals_bisect_refine (const __global double_d *l, const __global double_d *d, double_d *low_res, double_d *high_res, const int n, const int i)
 Refines bounds on the i-th largest eigenvalue of an LDL decomposition using bisection.
 
__kernel void eigenvals (const __global double *diagonal, const __global double *subdiagonal_squared, const __global double_d *l, const __global double_d *d, __global double *eigval_global, __global double_d *shifted_low_global, __global double_d *shifted_high_global, const double min_eigval, const double max_eigval, const double shift, const char do_refine)
 Calculates eigenvalues of a tridiagonal matrix T and refines shifted eigenvalues using shifted LDL decomposition of T.
 
int get_twisted_factorization (const __global double_d *l, const __global double_d *d, double_d shift, __global double_d *l_plus, __global double_d *u_minus, __global double_d *s)
 Calculates shifted LDL and UDU factorizations.
 
void calculate_eigenvector (const __global double_d *l_plus, const __global double_d *u_minus, const __global double *subdiag, int twist_idx, __global double *eigenvectors)
 Calculates an eigenvector from twisted factorization T - shift * I = L+.
 
__kernel void get_eigenvectors (const __global double_d *l, const __global double_d *d, const __global double *subdiag, const __global double_d *shifted_eigvals, __global double_d *l_plus, __global double_d *u_minus, __global double_d *temp, __global double *eigenvectors)
 Calculates eigenvectors for (shifted) eigenvalues.
 
__kernel void multiply_transpose (const __global double *A, __global double *B, const int M, const int N)
 Matrix multiplication of the form A*A^T on the OpenCL device.
 
__kernel void neg_binomial_2_log_glm (__global double *logp_global, __global double *theta_derivative_global, __global double *theta_derivative_sum, __global double *phi_derivative_global, const __global int *y_global, const __global double *x, const __global double *alpha, const __global double *beta, const __global double *phi_global, const int N, const int M, const int is_y_vector, const int is_alpha_vector, const int is_phi_vector, const int need_theta_derivative, const int need_theta_derivative_sum, const int need_phi_derivative, const int need_phi_derivative_sum, const int need_logp1, const int need_logp2, const int need_logp3, const int need_logp4)
 GPU implementation of Generalized Linear Model (GLM) with Negative-Binomial-2 distribution and log link function.
 
__kernel void neg_rect_lower_tri_multiply (__global double *A, const __global double *temp, const int A_rows, const int rows)
 Calculates C = -B * A where B is rectangular and A is lower triangular.
 
__kernel void ordered_logistic_glm (__global double *location_sum, __global double *logp_global, __global double *location_derivative, __global double *cuts_derivative, const __global int *y_global, const __global double *x, const __global double *beta, const __global double *cuts, const int N_instances, const int N_attributes, const int N_classes, const int is_y_vector, const int need_location_derivative, const int need_cuts_derivative)
 GPU implementation of ordinal regression Generalized Linear Model (GLM).
 
__kernel void ordered_logistic (__global double *logp_global, __global double *lambda_derivative, __global double *cuts_derivative, const __global int *y_global, const __global double *lambda_global, const __global double *cuts, const int N_instances, const int N_classes, const int is_y_vector, const int is_cuts_matrix, const int need_lambda_derivative, const int need_cuts_derivative)
 GPU implementation of ordinal regression.
 
__kernel void pack (__global double *A, __global double *B, unsigned int rows, unsigned int cols, unsigned int view)
 Packs a flat matrix to a packed triangular matrix.
 
__kernel void rep_matrix_rev (__global double *A_adj, __global double *B_adj, unsigned int B_rows, unsigned int B_cols, unsigned int view_B)
 Implements reverse pass of rep_matrix.
 
__kernel void tridiagonalization_householder (__global double *P, __global double *V, __global double *q_glob, const int P_rows, const int V_rows, const int j, const int k)
 Calculates householder vector and first element of the vector v.
 
__kernel void tridiagonalization_v_step_1 (const __global double *P, const __global double *V, __global double *Uu, __global double *Vu, const int P_rows, const int V_rows, const int k)
 Calculates first part of constructing the vector v: Uu = Pb * u and Vu = Vl * u.
 
__kernel void tridiagonalization_v_step_2 (const __global double *P, __global double *V, const __global double *Uu, const __global double *Vu, const int P_rows, const int V_rows, const int k, const int j)
 Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu.
 
__kernel void tridiagonalization_v_step_3 (__global double *P, __global double *V, __global double *q, const int P_rows, const int V_rows, const int k, const int j)
 Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the householder vector.
 
__kernel void unpack (__global double *B, __global double *A, unsigned int rows, unsigned int cols, unsigned int view)
 Unpacks a packed triangular matrix to a flat matrix.
 

Variables

const kernel_cl< out_buffer, in_buffer, int, int, int > add_batch ("add_batch", {indexing_helpers, add_batch_kernel_code})
 See the docs for add_batch() .
 
const kernel_cl< out_buffer, int, int > batch_identity ("batch_identity", {indexing_helpers, batch_identity_kernel_code})
 See the docs for batch_identity() .
 
const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int > categorical_logit_glm ("categorical_logit_glm", {categorical_logit_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 See the docs for categorical_logit_glm() .
 
const kernel_cl< in_out_buffer, in_out_buffer, in_buffer, in_buffer, int, int, int, int > categorical_logit_glm_beta_derivative ("categorical_logit_glm_beta_derivative", {categorical_logit_glm_beta_derivative_kernel_code})
 See the docs for categorical_logit_glm_beta_derivative() .
 
const kernel_cl< in_buffer, out_buffer, int, int, const double > check_symmetric ("is_symmetric", {indexing_helpers, is_symmetric_kernel_code})
 See the docs for is_symmetric() .
 
const kernel_cl< in_out_buffer, int > cholesky_decompose ("cholesky_decompose", {indexing_helpers, cholesky_decompose_kernel_code})
 See the docs for cholesky_decompose() .
 
const kernel_cl< in_out_buffer, in_out_buffer, int > diag_inv ("diag_inv", {indexing_helpers, diag_inv_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}})
 See the docs for diag_inv() .
 
const kernel_cl< out_buffer, in_buffer, int > divide_columns_vec ("divide_columns_vec", {indexing_helpers, divide_columns_kernel_code})
 See the docs for divide_columns_vec() .
 
const kernel_cl< out_buffer, double, int, int, matrix_cl_view > fill_strict_tri ("fill_strict_tri", {indexing_helpers, view_kernel_helpers, fill_strict_tri_kernel_code})
 See the docs for fill_strict_tri() .
 
const kernel_cl< in_buffer, out_buffer, double, double, int, int > gp_exp_quad_cov ("gp_exp_quad_cov", {gp_exp_quad_cov_kernel_code})
 See the docs for gp_exp_quad_cov() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int > gp_exp_quad_cov_cross ("gp_exp_quad_cov_cross", {gp_exp_quad_cov_cross_kernel_code})
 See the docs for gp_exp_quad_cov_cross() .
 
const kernel_cl< in_buffer, out_buffer, double, double, int, int > gp_exponential_cov ("gp_exponential_cov", {gp_exponential_cov_kernel_code})
 See the docs for gp_exponential_cov() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int > gp_exponential_cov_cross ("gp_exponential_cov_cross", {gp_exponential_cov_cross_kernel_code})
 See the docs for gp_exponential_cov_cross() .
 
const kernel_cl< in_buffer, out_buffer, double, double, int, int > gp_matern32_cov ("gp_matern32_cov", {gp_matern32_cov_kernel_code})
 See the docs for gp_matern32_cov() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, int, int, int > gp_matern32_cov_cross ("gp_matern32_cov_cross", {gp_matern32_cov_cross_kernel_code})
 See the docs for gp_matern32_cov_cross() .
 
const kernel_cl< in_buffer, out_buffer, double, double, double, int, int > gp_matern52_cov ("gp_matern52_cov", {gp_matern52_cov_kernel_code})
 See the docs for gp_matern52_cov() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, double, double, double, int, int, int > gp_matern52_cov_cross ("gp_matern52_cov_cross", {gp_matern52_cov_cross_kernel_code})
 See the docs for gp_matern52_cov_cross() .
 
static const std::string indexing_helpers
 Defines helper macros for common matrix indexing operations.
 
static const std::string thread_block_helpers
 Defines a helper macro for kernels with 2D local size.
 
const kernel_cl< in_out_buffer, in_buffer, in_buffer, int > indexing_rev_global_atomic ("indexing_rev", {atomic_add_double_device_function, indexing_rev_global_atomic_kernel_code})
 See the docs for indexing_rev() .
 
const kernel_cl< in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int, int > indexing_rev_local_atomic ("indexing_rev", {atomic_add_double_device_function, indexing_rev_local_atomic_kernel_code})
 See the docs for indexing_rev() .
 
const kernel_cl< in_out_buffer, in_buffer, in_buffer, cl::LocalSpaceArg, int, int > indexing_rev_local_independent ("indexing_rev", {atomic_add_double_device_function, indexing_rev_local_independent_kernel_code})
 See the docs for indexing_rev() .
 
const kernel_cl< in_buffer, out_buffer, int, int > inv_lower_tri_multiply ("inv_lower_tri_multiply", {thread_block_helpers, inv_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
 See the docs for inv_lower_tri_multiply() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, int, matrix_cl_view, matrix_cl_view > matrix_multiply ("matrix_multiply", {thread_block_helpers, view_kernel_helpers, matrix_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
 See the docs for matrix_multiply() .
 
const kernel_cl< in_buffer, in_buffer, out_buffer, int, int, matrix_cl_view, matrix_cl_view > row_vector_matrix_multiply ("row_vector_matrix_multiply", {view_kernel_helpers, row_vector_matrix_multiply_kernel_code}, {{"LOCAL_SIZE_", 64}, {"REDUCTION_STEP_SIZE", 4}})
 See the docs for row_vector_matrix_multiply() .
 
const kernel_cl< in_buffer, in_buffer, in_buffer, in_buffer, out_buffer, out_buffer, out_buffer, double, double, double, char > eigenvals ("eigenvals", {stan::math::internal::double_d_src, eigenvals_bisect_kernel_code})
 
const kernel_cl< in_buffer, in_buffer, in_buffer, in_buffer, in_out_buffer, in_out_buffer, in_out_buffer, out_buffer > get_eigenvectors ("get_eigenvectors", {stan::math::internal::double_d_src, get_eigenvectors_kernel_code})
 
const kernel_cl< in_buffer, out_buffer, int, int > multiply_transpose ("multiply_transpose", {thread_block_helpers, multiply_transpose_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 4}})
 See the docs for multiply_transpose() .
 
const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int, int, int, int, int, int, int, int > neg_binomial_2_log_glm ("neg_binomial_2_log_glm", {digamma_device_function, log1p_exp_device_function, neg_binomial_2_log_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 See the docs for neg_binomial_2_log_glm() .
 
const kernel_cl< in_out_buffer, in_buffer, int, int > neg_rect_lower_tri_multiply ("neg_rect_lower_tri_multiply", {thread_block_helpers, neg_rect_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
 See the docs for neg_rect_lower_tri_multiply() .
 
const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int > ordered_logistic_glm ("ordered_logistic_glm", {log1p_exp_device_function, log1m_exp_device_function, inv_logit_device_function, ordered_logistic_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 See the docs for ordered_logistic_glm() .
 
const kernel_cl< out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int > ordered_logistic ("ordered_logistic", {log1p_exp_device_function, log1m_exp_device_function, inv_logit_device_function, ordered_logistic_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 See the docs for ordered_logistic() .
 
const kernel_cl< out_buffer, in_buffer, int, int, matrix_cl_view > pack ("pack", {indexing_helpers, pack_kernel_code})
 See the docs for pack() .
 
const kernel_cl< in_out_buffer, in_buffer, int, int, matrix_cl_view > rep_matrix_rev ("rep_matrix_rev", {view_kernel_helpers, rep_matrix_rev_kernel_code})
 See the docs for rep_matrix_rev() .
 
const kernel_cl< in_out_buffer, in_out_buffer, out_buffer, int, int, int, int > tridiagonalization_householder ("tridiagonalization_householder", {tridiagonalization_householder_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}})
 
const kernel_cl< in_buffer, in_buffer, out_buffer, out_buffer, int, int, int > tridiagonalization_v_step_1 ("tridiagonalization_v_step_1", {tridiagonalization_v_step_1_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 
const kernel_cl< in_buffer, out_buffer, in_buffer, in_buffer, int, int, int, int > tridiagonalization_v_step_2 ("tridiagonalization_v_step_2", {tridiagonalization_v_step_2_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
 
const kernel_cl< in_out_buffer, in_out_buffer, out_buffer, int, int, int, int > tridiagonalization_v_step_3 ("tridiagonalization_v_step_3", {tridiagonalization_v_step_3_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}})
 
const kernel_cl< out_buffer, in_buffer, int, int, matrix_cl_view > unpack ("unpack", {indexing_helpers, unpack_kernel_code})
 See the docs for unpack() .