1#ifndef STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
2#define STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
12namespace opencl_kernels {
15static constexpr const char *cumulative_sum1_kernel_code =
STRINGIFY(
28 __global SCAL *out_threads, __global SCAL *in,
30 const int gid = get_global_id(0);
31 const int lid = get_local_id(0);
32 const int lsize = get_local_size(0);
33 const int wg_id = get_group_id(0);
34 const int gsize = get_global_size(0);
36 int start = (int)((
long)gid *
size / gsize);
37 int end = (int)((
long)(gid + 1) *
size / gsize);
38 __local SCAL local_storage[LOCAL_SIZE_];
43 for (
int i = start + 1; i < end; i++) {
47 for (
int step = 1;
step < lsize;
step *= REDUCTION_STEP_SIZE) {
48 local_storage[lid] = acc;
49 barrier(CLK_LOCAL_MEM_FENCE);
50 for (
int i = 1; i < REDUCTION_STEP_SIZE &&
step * i <= lid; i++) {
51 acc += local_storage[lid -
step * i];
53 barrier(CLK_LOCAL_MEM_FENCE);
55 out_threads[gid] = acc;
56 if (lid == LOCAL_SIZE_ - 1) {
65static constexpr const char *cumulative_sum2_kernel_code =
STRINGIFY(
76 const int gid = get_global_id(0);
77 const int gsize = get_global_size(0);
79 int start = (int)((
long)gid *
size / gsize);
80 int end = (int)((
long)(gid + 1) *
size / gsize);
81 __local SCAL local_storage[LOCAL_SIZE_];
88 for (
int i = start + 1; i < end; i++) {
92 local_storage[gid] = acc;
93 barrier(CLK_LOCAL_MEM_FENCE);
94 for (
int step = 1;
step < gsize;
step *= REDUCTION_STEP_SIZE) {
95 for (
int i = 1; i < REDUCTION_STEP_SIZE &&
step * i <= gid; i++) {
96 acc += local_storage[gid -
step * i];
98 barrier(CLK_LOCAL_MEM_FENCE);
99 local_storage[gid] = acc;
100 barrier(CLK_LOCAL_MEM_FENCE);
106 acc = local_storage[gid - 1];
108 for (
int i = start; i < end; i++) {
119static constexpr const char *cumulative_sum3_kernel_code =
STRINGIFY(
136 __global SCAL *in_threads,
137 __global SCAL *in_wgs,
int size) {
138 const int gid = get_global_id(0);
139 const int lid = get_local_id(0);
140 const int lsize = get_local_size(0);
141 const int wg_id = get_group_id(0);
142 const int gsize = get_global_size(0);
144 int start = (int)((
long)gid *
size / gsize);
145 int end = (int)((
long)(gid + 1) *
size / gsize);
146 __local SCAL local_storage[LOCAL_SIZE_];
150 acc = in_wgs[wg_id - 1];
153 acc += in_threads[gid - 1];
155 for (
int i = start; i < end; i++) {
167template <
typename Scalar,
typename =
void>
188 {
"#define SCAL double\n",
189 cumulative_sum1_kernel_code},
190 {{
"REDUCTION_STEP_SIZE", 4},
191 {
"LOCAL_SIZE_", 16}});
193const kernel_cl<out_buffer, out_buffer, in_buffer, int>
194 cumulative_sum<int, T>::kernel1(
195 "cumulative_sum1", {
"#define SCAL int\n", cumulative_sum1_kernel_code},
196 {{
"REDUCTION_STEP_SIZE", 4}, {
"LOCAL_SIZE_", 16}});
199const kernel_cl<in_out_buffer, int> cumulative_sum<double, T>::kernel2(
200 "cumulative_sum2", {
"#define SCAL double\n", cumulative_sum2_kernel_code},
201 {{
"REDUCTION_STEP_SIZE", 4}, {
"LOCAL_SIZE_", 1024}});
203const kernel_cl<in_out_buffer, int> cumulative_sum<int, T>::kernel2(
204 "cumulative_sum2", {
"#define SCAL int\n", cumulative_sum2_kernel_code},
205 {{
"REDUCTION_STEP_SIZE", 4}, {
"LOCAL_SIZE_", 1024}});
208const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
209 cumulative_sum<double, T>::kernel3(
"cumulative_sum3",
210 {
"#define SCAL double\n",
211 cumulative_sum3_kernel_code},
212 {{
"REDUCTION_STEP_SIZE", 4},
213 {
"LOCAL_SIZE_", 16}});
215const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
216 cumulative_sum<int, T>::kernel3(
217 "cumulative_sum3", {
"#define SCAL int\n", cumulative_sum3_kernel_code},
218 {{
"REDUCTION_STEP_SIZE", 4}, {
"LOCAL_SIZE_", 16}});
__kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data, __global SCAL *in_threads, __global SCAL *in_wgs, int size)
Third kernel of the cumulative sum implementation.
__kernel void cumulative_sum1(__global SCAL *out_wgs, __global SCAL *out_threads, __global SCAL *in, int size)
First kernel of the cumulative sum implementation.
__kernel void cumulative_sum2(__global SCAL *data, int size)
Second kernel of the cumulative sum implementation.
int64_t size(const T &m)
Returns the size (number of the elements) of a matrix_cl or var_value<matrix_cl<T>>.
T step(const T &y)
The step, or Heaviside, function.
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
static const kernel_cl< in_out_buffer, int > kernel2
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
static const kernel_cl< in_out_buffer, int > kernel2
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
struct containing cumulative_sum kernels, grouped by scalar type.
Creates functor for kernels.