Automatic Differentiation
 
Loading...
Searching...
No Matches
cumulative_sum.hpp
Go to the documentation of this file.
1#ifndef STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
2#define STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
3#ifdef STAN_OPENCL
4
8#include <string>
9
10namespace stan {
11namespace math {
12namespace opencl_kernels {
13
14// \cond
15static constexpr const char *cumulative_sum1_kernel_code = STRINGIFY(
16 // \endcond
27 __kernel void cumulative_sum1(__global SCAL *out_wgs,
28 __global SCAL *out_threads, __global SCAL *in,
29 int size) {
30 const int gid = get_global_id(0);
31 const int lid = get_local_id(0);
32 const int lsize = get_local_size(0);
33 const int wg_id = get_group_id(0);
34 const int gsize = get_global_size(0);
35
36 int start = (int)((long)gid * size / gsize); // NOLINT
37 int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
38 __local SCAL local_storage[LOCAL_SIZE_];
39
40 SCAL acc = 0;
41 if (start != end) {
42 acc = in[start];
43 for (int i = start + 1; i < end; i++) {
44 acc += in[i];
45 }
46 }
47 for (int step = 1; step < lsize; step *= REDUCTION_STEP_SIZE) {
48 local_storage[lid] = acc;
49 barrier(CLK_LOCAL_MEM_FENCE);
50 for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= lid; i++) {
51 acc += local_storage[lid - step * i];
52 }
53 barrier(CLK_LOCAL_MEM_FENCE);
54 }
55 out_threads[gid] = acc;
56 if (lid == LOCAL_SIZE_ - 1) {
57 out_wgs[wg_id] = acc;
58 }
59 }
60 // \cond
61);
62// \endcond
63
64// \cond
65static constexpr const char *cumulative_sum2_kernel_code = STRINGIFY(
66 // \endcond
75 __kernel void cumulative_sum2(__global SCAL *data, int size) {
76 const int gid = get_global_id(0);
77 const int gsize = get_global_size(0);
78
79 int start = (int)((long)gid * size / gsize); // NOLINT
80 int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
81 __local SCAL local_storage[LOCAL_SIZE_];
82
83 SCAL acc;
84 if (start == end) {
85 acc = 0;
86 } else {
87 acc = data[start];
88 for (int i = start + 1; i < end; i++) {
89 acc += data[i];
90 }
91 }
92 local_storage[gid] = acc;
93 barrier(CLK_LOCAL_MEM_FENCE);
94 for (int step = 1; step < gsize; step *= REDUCTION_STEP_SIZE) {
95 for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= gid; i++) {
96 acc += local_storage[gid - step * i];
97 }
98 barrier(CLK_LOCAL_MEM_FENCE);
99 local_storage[gid] = acc;
100 barrier(CLK_LOCAL_MEM_FENCE);
101 }
102 if (start != end) {
103 if (gid == 0) {
104 acc = 0;
105 } else {
106 acc = local_storage[gid - 1];
107 }
108 for (int i = start; i < end; i++) {
109 acc += data[i];
110 data[i] = acc;
111 }
112 }
113 }
114 // \cond
115);
116// \endcond
117
118// \cond
119static constexpr const char *cumulative_sum3_kernel_code = STRINGIFY(
120 // \endcond
135 __kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data,
136 __global SCAL *in_threads,
137 __global SCAL *in_wgs, int size) {
138 const int gid = get_global_id(0);
139 const int lid = get_local_id(0);
140 const int lsize = get_local_size(0);
141 const int wg_id = get_group_id(0);
142 const int gsize = get_global_size(0);
143
144 int start = (int)((long)gid * size / gsize); // NOLINT
145 int end = (int)((long)(gid + 1) * size / gsize); // NOLINT
146 __local SCAL local_storage[LOCAL_SIZE_];
147
148 SCAL acc = 0;
149 if (wg_id != 0) {
150 acc = in_wgs[wg_id - 1];
151 }
152 if (lid != 0) {
153 acc += in_threads[gid - 1];
154 }
155 for (int i = start; i < end; i++) {
156 acc += in_data[i];
157 out[i] = acc;
158 }
159 }
160 // \cond
161);
162// \endcond
163
167template <typename Scalar, typename = void>
169
170template <typename T>
171struct cumulative_sum<double, T> {
176};
177template <typename T>
178struct cumulative_sum<int, T> {
183};
184
185template <typename T>
187 cumulative_sum<double, T>::kernel1("cumulative_sum1",
188 {"#define SCAL double\n",
189 cumulative_sum1_kernel_code},
190 {{"REDUCTION_STEP_SIZE", 4},
191 {"LOCAL_SIZE_", 16}});
192template <typename T>
193const kernel_cl<out_buffer, out_buffer, in_buffer, int>
194 cumulative_sum<int, T>::kernel1(
195 "cumulative_sum1", {"#define SCAL int\n", cumulative_sum1_kernel_code},
196 {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});
197
198template <typename T>
199const kernel_cl<in_out_buffer, int> cumulative_sum<double, T>::kernel2(
200 "cumulative_sum2", {"#define SCAL double\n", cumulative_sum2_kernel_code},
201 {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});
202template <typename T>
203const kernel_cl<in_out_buffer, int> cumulative_sum<int, T>::kernel2(
204 "cumulative_sum2", {"#define SCAL int\n", cumulative_sum2_kernel_code},
205 {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});
206
207template <typename T>
208const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
209 cumulative_sum<double, T>::kernel3("cumulative_sum3",
210 {"#define SCAL double\n",
211 cumulative_sum3_kernel_code},
212 {{"REDUCTION_STEP_SIZE", 4},
213 {"LOCAL_SIZE_", 16}});
214template <typename T>
215const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
216 cumulative_sum<int, T>::kernel3(
217 "cumulative_sum3", {"#define SCAL int\n", cumulative_sum3_kernel_code},
218 {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});
219
220} // namespace opencl_kernels
221} // namespace math
222} // namespace stan
223#endif
224#endif
__kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data, __global SCAL *in_threads, __global SCAL *in_wgs, int size)
Third kernel of the cumulative sum implementation.
__kernel void cumulative_sum1(__global SCAL *out_wgs, __global SCAL *out_threads, __global SCAL *in, int size)
First kernel of the cumulative sum implementation.
__kernel void cumulative_sum2(__global SCAL *data, int size)
Second kernel of the cumulative sum implementation.
int64_t size(const T &m)
Returns the size (number of the elements) of a matrix_cl or var_value<matrix_cl<T>>.
Definition size.hpp:19
T step(const T &y)
The step, or Heaviside, function.
Definition step.hpp:31
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
#define STRINGIFY(...)
Definition stringify.hpp:9
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
static const kernel_cl< in_out_buffer, int > kernel2
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
static const kernel_cl< in_out_buffer, int > kernel2
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
struct containing cumulative_sum kernels, grouped by scalar type.
Creates functor for kernels.