math/opencl_2kernels_2cumulative__sum_8hpp_source.html

#ifndef STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP

#define STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP

#ifdef STAN_OPENCL


#include <stan/math/opencl/kernel_cl.hpp>

#include <stan/math/opencl/buffer_types.hpp>

#include <stan/math/opencl/matrix_cl_view.hpp>

#include <string>


namespace stan {

namespace math {

namespace opencl_kernels {


// \cond

static constexpr const char *cumulative_sum1_kernel_code = STRINGIFY(
    // \endcond
    /**
     * First kernel of the cumulative sum implementation. Every work item sums
     * its own contiguous slice of the input. The work items of one work group
     * then combine these per-item sums into an inclusive scan that is
     * exchanged through local memory.
     *
     * @param[out] out_wgs one value per work group: the sum of all input
     * elements assigned to the work items of that work group
     * @param[out] out_threads one value per work item: the sum of the input
     * elements assigned to this work item and to all preceding work items of
     * the same work group
     * @param[in] in input data
     * @param size number of elements in the input
     * @note The local size of the launch must not exceed LOCAL_SIZE_, which is
     * the capacity of the local scratch array.
     */
    __kernel void cumulative_sum1(__global SCAL *out_wgs,
                                  __global SCAL *out_threads, __global SCAL *in,
                                  int size) {
      const int gid = get_global_id(0);
      const int lid = get_local_id(0);
      const int lsize = get_local_size(0);
      const int wg_id = get_group_id(0);
      const int gsize = get_global_size(0);

      // Half-open slice [start, end) of the input owned by this work item.
      // The product is widened to long before the division so that
      // gid * size is not evaluated in int.
      int start = (int)((long)gid * size / gsize);      // NOLINT
      int end = (int)((long)(gid + 1) * size / gsize);  // NOLINT
      __local SCAL local_storage[LOCAL_SIZE_];

      // Sequential sum of the slice; an empty slice contributes zero.
      SCAL acc = 0;
      if (start != end) {
        acc = in[start];
        for (int i = start + 1; i < end; i++) {
          acc += in[i];
        }
      }
      // Inclusive scan over the work group. In every round each work item
      // publishes its running value and then adds the values of up to
      // REDUCTION_STEP_SIZE - 1 predecessors located step, 2 * step, ...
      // positions before it.
      for (int step = 1; step < lsize; step *= REDUCTION_STEP_SIZE) {
        local_storage[lid] = acc;
        // All values of this round must be visible before anyone reads them.
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= lid; i++) {
          acc += local_storage[lid - step * i];
        }
        // All reads must be finished before the next round overwrites them.
        barrier(CLK_LOCAL_MEM_FENCE);
      }
      out_threads[gid] = acc;
      // The last work item of the group holds the total of the whole group.
      // The runtime local size is used here (as in the scan loop above), so
      // the total is also written when the kernel is launched with a local
      // size smaller than LOCAL_SIZE_.
      if (lid == lsize - 1) {
        out_wgs[wg_id] = acc;
      }
    }
    // \cond
);

// \endcond


// \cond

static constexpr const char *cumulative_sum2_kernel_code = STRINGIFY(
    // \endcond
    /**
     * Second kernel of the cumulative sum implementation. Replaces the
     * contents of the buffer with its inclusive cumulative sum, in place.
     *
     * The local scratch array is indexed by the global id, so the kernel is
     * only valid when launched as a single work group with at most
     * LOCAL_SIZE_ work items.
     *
     * @param[in,out] data values to accumulate; overwritten with the result
     * @param size number of elements in the buffer
     */
    __kernel void cumulative_sum2(__global SCAL *data, int size) {
      __local SCAL partial_sums[LOCAL_SIZE_];

      const int item = get_global_id(0);
      const int n_items = get_global_size(0);

      // Half-open slice [first, last) of the buffer handled by this item.
      const int first = (int)((long)item * size / n_items);      // NOLINT
      const int last = (int)((long)(item + 1) * size / n_items);  // NOLINT
      const bool has_work = first != last;

      // Sum of the slice owned by this item (zero for an empty slice).
      SCAL running = 0;
      if (has_work) {
        running = data[first];
        for (int pos = first + 1; pos < last; ++pos) {
          running += data[pos];
        }
      }
      partial_sums[item] = running;
      barrier(CLK_LOCAL_MEM_FENCE);

      // Inclusive scan of the per-item sums. Each round adds the values of up
      // to REDUCTION_STEP_SIZE - 1 predecessors, then republishes the result.
      int stride = 1;
      while (stride < n_items) {
        for (int k = 1; k < REDUCTION_STEP_SIZE && stride * k <= item; ++k) {
          running += partial_sums[item - stride * k];
        }
        // Everyone must finish reading before the values are replaced ...
        barrier(CLK_LOCAL_MEM_FENCE);
        partial_sums[item] = running;
        // ... and the new values must be visible before the next round.
        barrier(CLK_LOCAL_MEM_FENCE);
        stride *= REDUCTION_STEP_SIZE;
      }

      // No barrier follows, so items without elements can leave here.
      if (!has_work) {
        return;
      }

      // Start from the total of all preceding slices and rewrite the own
      // slice with the running sum.
      running = (item == 0) ? 0 : partial_sums[item - 1];
      for (int pos = first; pos < last; ++pos) {
        running += data[pos];
        data[pos] = running;
      }
    }
    // \cond
);

// \endcond


// \cond

static constexpr const char *cumulative_sum3_kernel_code = STRINGIFY(
    // \endcond
    /**
     * Third kernel of the cumulative sum implementation. Every work item
     * rewrites its contiguous slice of the input as a running sum that starts
     * from the offsets supplied for its work group and for the preceding work
     * item.
     *
     * The slices are derived from the global id and the global size, and the
     * preceding-item offset is read at index gid - 1. The kernel is therefore
     * expected to run with the same global and local sizes as the launch that
     * produced in_threads.
     *
     * @param[out] out running sums, one per input element
     * @param[in] in_data input data
     * @param[in] in_threads per work item offsets; entry gid - 1 is added for
     * every work item that is not the first of its work group
     * @param[in] in_wgs per work group offsets; entry wg_id - 1 is the
     * starting value for every work group except the first
     * @param size number of elements in the input
     */
    __kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data,
                                  __global SCAL *in_threads,
                                  __global SCAL *in_wgs, int size) {
      const int gid = get_global_id(0);
      const int lid = get_local_id(0);
      const int wg_id = get_group_id(0);
      const int gsize = get_global_size(0);

      // Half-open slice [start, end) of the input owned by this work item.
      // The product is widened to long before the division so that
      // gid * size is not evaluated in int.
      int start = (int)((long)gid * size / gsize);      // NOLINT
      int end = (int)((long)(gid + 1) * size / gsize);  // NOLINT

      // Offset contributed by everything that precedes this slice.
      SCAL acc = 0;
      if (wg_id != 0) {
        acc = in_wgs[wg_id - 1];
      }
      if (lid != 0) {
        acc += in_threads[gid - 1];
      }
      for (int i = start; i < end; i++) {
        acc += in_data[i];
        out[i] = acc;
      }
    }
    // \cond
);

// \endcond


/**
 * Struct containing the cumulative sum kernels, grouped by scalar type.
 *
 * The primary template is empty; kernels are only declared by the
 * specializations for `double` and `int` below.
 *
 * @tparam Scalar scalar type the kernels operate on
 */
template <typename Scalar, typename = void>

struct cumulative_sum {};


/**
 * Cumulative sum kernels for `double` scalars.
 *
 * @tparam T not used by any member. NOTE(review): presumably present only so
 * that the static members can be defined in this header - confirm.
 */
template <typename T>

struct cumulative_sum<double, T> {

  /// Functor for `cumulative_sum1` (two output buffers, one input buffer,
  /// element count).
  static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;

  /// Functor for `cumulative_sum2` (one buffer updated in place, element
  /// count).
  static const kernel_cl<in_out_buffer, int> kernel2;

  /// Functor for `cumulative_sum3` (one output buffer, three input buffers,
  /// element count).
  static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>

      kernel3;

};

/**
 * Cumulative sum kernels for `int` scalars.
 *
 * @tparam T not used by any member. NOTE(review): presumably present only so
 * that the static members can be defined in this header - confirm.
 */
template <typename T>

struct cumulative_sum<int, T> {

  /// Functor for `cumulative_sum1` (two output buffers, one input buffer,
  /// element count).
  static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;

  /// Functor for `cumulative_sum2` (one buffer updated in place, element
  /// count).
  static const kernel_cl<in_out_buffer, int> kernel2;

  /// Functor for `cumulative_sum3` (one output buffer, three input buffers,
  /// element count).
  static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>

      kernel3;

};


// Definition of the `double` variant of the first kernel: kernel name
// "cumulative_sum1", sources consisting of a `SCAL` define followed by the
// kernel code, and the options REDUCTION_STEP_SIZE = 4 and LOCAL_SIZE_ = 16.
// NOTE(review): the kernel code uses both option names as constants, so
// kernel_cl presumably injects them as preprocessor definitions - confirm.
template <typename T>

const kernel_cl<out_buffer, out_buffer, in_buffer, int>

    cumulative_sum<double, T>::kernel1("cumulative_sum1",

                                       {"#define SCAL double\n",

                                        cumulative_sum1_kernel_code},

                                       {{"REDUCTION_STEP_SIZE", 4},

                                        {"LOCAL_SIZE_", 16}});

// Definition of the `int` variant of the first kernel; identical to the
// `double` variant except that `SCAL` is defined as `int`.
template <typename T>

const kernel_cl<out_buffer, out_buffer, in_buffer, int>

    cumulative_sum<int, T>::kernel1(

        "cumulative_sum1", {"#define SCAL int\n", cumulative_sum1_kernel_code},

        {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});


// Definition of the `double` variant of the second kernel: kernel name
// "cumulative_sum2", sources consisting of a `SCAL` define followed by the
// kernel code, and the options REDUCTION_STEP_SIZE = 4 and
// LOCAL_SIZE_ = 1024. LOCAL_SIZE_ is the capacity of the local scratch array
// that the kernel indexes by global id.
template <typename T>

const kernel_cl<in_out_buffer, int> cumulative_sum<double, T>::kernel2(

    "cumulative_sum2", {"#define SCAL double\n", cumulative_sum2_kernel_code},

    {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});

// Definition of the `int` variant of the second kernel; identical to the
// `double` variant except that `SCAL` is defined as `int`.
template <typename T>

const kernel_cl<in_out_buffer, int> cumulative_sum<int, T>::kernel2(

    "cumulative_sum2", {"#define SCAL int\n", cumulative_sum2_kernel_code},

    {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});


// Definition of the `double` variant of the third kernel: kernel name
// "cumulative_sum3", sources consisting of a `SCAL` define followed by the
// kernel code, and the options REDUCTION_STEP_SIZE = 4 and LOCAL_SIZE_ = 16
// (the same option values as the first kernel).
template <typename T>

const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>

    cumulative_sum<double, T>::kernel3("cumulative_sum3",

                                       {"#define SCAL double\n",

                                        cumulative_sum3_kernel_code},

                                       {{"REDUCTION_STEP_SIZE", 4},

                                        {"LOCAL_SIZE_", 16}});

// Definition of the `int` variant of the third kernel; identical to the
// `double` variant except that `SCAL` is defined as `int`.
template <typename T>

const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>

    cumulative_sum<int, T>::kernel3(

        "cumulative_sum3", {"#define SCAL int\n", cumulative_sum3_kernel_code},

        {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});


}  // namespace opencl_kernels

}  // namespace math

}  // namespace stan

#endif

#endif

buffer_types.hpp

stan::math::opencl_kernels::cumulative_sum3
__kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data, __global SCAL *in_threads, __global SCAL *in_wgs, int size)
Third kernel of the cumulative sum implementation.
Definition cumulative_sum.hpp:135

stan::math::opencl_kernels::cumulative_sum1
__kernel void cumulative_sum1(__global SCAL *out_wgs, __global SCAL *out_threads, __global SCAL *in, int size)
First kernel of the cumulative sum implementation.
Definition cumulative_sum.hpp:27

stan::math::opencl_kernels::cumulative_sum2
__kernel void cumulative_sum2(__global SCAL *data, int size)
Second kernel of the cumulative sum implementation.
Definition cumulative_sum.hpp:75

stan::math::size
int64_t size(const T &m)
Returns the size (number of elements) of a matrix_cl or var_value<matrix_cl<T>>.
Definition size.hpp:19

kernel_cl.hpp

matrix_cl_view.hpp

stan::math::step
T step(const T &y)
The step, or Heaviside, function.
Definition step.hpp:31

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

STRINGIFY
#define STRINGIFY(...)
Definition stringify.hpp:9

stan::math::opencl_kernels::cumulative_sum< double, T >::kernel3
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
Definition cumulative_sum.hpp:175

stan::math::opencl_kernels::cumulative_sum< double, T >::kernel1
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
Definition cumulative_sum.hpp:172

stan::math::opencl_kernels::cumulative_sum< double, T >::kernel2
static const kernel_cl< in_out_buffer, int > kernel2
Definition cumulative_sum.hpp:173

stan::math::opencl_kernels::cumulative_sum< int, T >::kernel1
static const kernel_cl< out_buffer, out_buffer, in_buffer, int > kernel1
Definition cumulative_sum.hpp:179

stan::math::opencl_kernels::cumulative_sum< int, T >::kernel2
static const kernel_cl< in_out_buffer, int > kernel2
Definition cumulative_sum.hpp:180

stan::math::opencl_kernels::cumulative_sum< int, T >::kernel3
static const kernel_cl< out_buffer, in_buffer, in_buffer, in_buffer, int > kernel3
Definition cumulative_sum.hpp:182

stan::math::opencl_kernels::cumulative_sum
struct containing cumulative_sum kernels, grouped by scalar type.
Definition cumulative_sum.hpp:168

stan::math::opencl_kernels::kernel_cl
Creates functor for kernels.
Definition kernel_cl.hpp:174