math/opencl_2kernels_2categorical__logit__glm__lpmf_8hpp_source.html

#ifndef STAN_MATH_OPENCL_KERNELS_CATEGORICAL_LOGIT_GLM_LPMF_HPP

#define STAN_MATH_OPENCL_KERNELS_CATEGORICAL_LOGIT_GLM_LPMF_HPP

#ifdef STAN_OPENCL


#include <stan/math/opencl/kernel_cl.hpp>

#include <string>


namespace stan {

namespace math {

namespace opencl_kernels {


// \cond

static constexpr const char* categorical_logit_glm_kernel_code = STRINGIFY(
    // \endcond
    /** \ingroup opencl_kernels
     * Per-instance part of the categorical logit GLM log-probability, plus
     * the intermediate quantities used for its derivatives.
     *
     * Each work item with gid < N_instances handles one instance. For every
     * class c it forms lin[c] = x_beta_global[c * N_instances + gid]
     * + alpha_global[c], subtracts the per-instance maximum before calling
     * exp(), and stores the shifted exponentials and the reciprocal of their
     * sum. The log-probability of the observed class is then reduced over the
     * work group, so the kernel emits one partial sum per work group; the
     * partial sums have to be added up by the caller.
     *
     * A label outside 1..N_classes makes the instance contribute NAN, which
     * propagates into the partial sum of its work group.
     *
     * The local scratch array holds LOCAL_SIZE_ elements and is indexed by
     * the local id, so the local size must not exceed LOCAL_SIZE_. The
     * reductions only visit every element when the local size is a power of
     * REDUCTION_STEP_SIZE.
     *
     * @param[out] logp_global one partial sum of the log-probability per
     *   work group, indexed by work group id
     * @param[out] exp_lin_global exp(lin - max(lin)) for every class and
     *   instance, indexed as class * N_instances + instance
     * @param[out] inv_sum_exp_lin_global for every instance, the reciprocal
     *   of the sum over classes of the values in exp_lin_global
     * @param[out] neg_softmax_lin_global -exp_lin * inv_sum_exp_lin for every
     *   class and instance; only written if need_neg_softmax_lin_global is
     *   nonzero
     * @param[out] alpha_derivative per work group sums over instances of
     *   (neg_softmax_lin + (label == class)), indexed as
     *   class + work_group_id * N_classes; only written if
     *   need_alpha_derivative is nonzero
     * @param[in] y_global class labels, 1-based; read at gid * is_y_vector
     * @param[in] x_beta_global linear predictor without intercept, indexed as
     *   class * N_instances + instance
     * @param[in] alpha_global intercept, one value per class
     * @param N_instances number of instances
     * @param N_attributes number of attributes (not used by this kernel)
     * @param N_classes number of classes
     * @param is_y_vector 1 if y_global holds one label per instance, 0 if
     *   its first element applies to every instance
     * @param need_alpha_derivative nonzero to compute alpha_derivative
     * @param need_neg_softmax_lin_global nonzero to store
     *   neg_softmax_lin_global
     */
    __kernel void categorical_logit_glm(
        __global double* logp_global, __global double* exp_lin_global,
        __global double* inv_sum_exp_lin_global,
        __global double* neg_softmax_lin_global,
        __global double* alpha_derivative, const __global int* y_global,
        const __global double* x_beta_global,
        const __global double* alpha_global, const int N_instances,
        const int N_attributes, const int N_classes, const int is_y_vector,
        const int need_alpha_derivative,
        const int need_neg_softmax_lin_global) {
      const int gid = get_global_id(0);
      const int lid = get_local_id(0);
      const int lsize = get_local_size(0);
      const int wg_id = get_group_id(0);

      __local double local_storage[LOCAL_SIZE_];

      double logp = 0;
      // Only read by work items with gid < N_instances, which assign it first.
      double inv_sum_exp_lin = 0;
      int class_idx = -1;
      // Most calculations only happen for relevant data within next if.
      // Exceptions are reductions between threads that need barriers.
      if (gid < N_instances) {
        // First pass: find the largest linear predictor so that the
        // exponentials below are taken of non-positive numbers.
        double lin_max = -INFINITY;
        for (int i = 0; i < N_classes; i++) {
          double lin = x_beta_global[i * N_instances + gid] + alpha_global[i];
          if (lin > lin_max) {
            lin_max = lin;
          }
        }
        // Second pass: shifted exponentials and their sum.
        double sum_exp_lin = 0;
        for (int i = 0; i < N_classes; i++) {
          double lin = x_beta_global[i * N_instances + gid] + alpha_global[i];
          double exp_lin = exp(lin - lin_max);
          sum_exp_lin += exp_lin;
          exp_lin_global[i * N_instances + gid] = exp_lin;
        }
        inv_sum_exp_lin = 1 / sum_exp_lin;
        inv_sum_exp_lin_global[gid] = inv_sum_exp_lin;

        // Labels are 1-based; valid zero-based indices are 0..N_classes-1.
        class_idx = y_global[gid * is_y_vector] - 1;
        if (class_idx < 0 || class_idx >= N_classes) {
          // Flag the invalid label instead of indexing out of bounds.
          logp = NAN;
        } else {
          logp = log(inv_sum_exp_lin) - lin_max
                 + x_beta_global[class_idx * N_instances + gid]
                 + alpha_global[class_idx];
        }
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      if (need_alpha_derivative || need_neg_softmax_lin_global) {
        for (int i = 0; i < N_classes; i++) {
          // Stays 0 for padding work items so they do not affect the sums.
          double neg_softmax_lin = 0;
          if (gid < N_instances) {
            int idx = i * N_instances + gid;
            neg_softmax_lin = -exp_lin_global[idx] * inv_sum_exp_lin;
            if (need_neg_softmax_lin_global) {
              neg_softmax_lin_global[idx] = neg_softmax_lin;
            }
          }
          if (need_alpha_derivative) {
            // Reduce this class' contribution over the work group.
            local_storage[lid] = neg_softmax_lin + (class_idx == i);
            barrier(CLK_LOCAL_MEM_FENCE);
            for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
                 step /= REDUCTION_STEP_SIZE) {
              if (lid < step) {
                for (int j = 1; j < REDUCTION_STEP_SIZE; j++) {
                  local_storage[lid] += local_storage[lid + step * j];
                }
              }
              barrier(CLK_LOCAL_MEM_FENCE);
            }
            if (lid == 0) {
              alpha_derivative[i + wg_id * N_classes] = local_storage[0];
            }
            // local_storage is reused by the next class and by the logp
            // reduction below.
            barrier(CLK_LOCAL_MEM_FENCE);
          }
        }
      }
      // Sum logp, calculated by different threads.
      // Since we can't sum between different work groups, we emit one number
      // per work group. These must be summed on CPU for final result.
      local_storage[lid] = logp;
      barrier(CLK_LOCAL_MEM_FENCE);
      for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
           step /= REDUCTION_STEP_SIZE) {
        if (lid < step) {
          for (int j = 1; j < REDUCTION_STEP_SIZE; j++) {
            local_storage[lid] += local_storage[lid + step * j];
          }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
      }
      if (lid == 0) {
        logp_global[wg_id] = local_storage[0];
      }
    }
    // \cond
);

// \endcond


/** \ingroup opencl_kernels
 * See the docs for categorical_logit_glm().
 *
 * Host-side handle for the categorical_logit_glm kernel. The template
 * arguments follow the kernel's parameter order: five output buffers, three
 * input buffers and six integers.
 *
 * REDUCTION_STEP_SIZE and LOCAL_SIZE_ are the names the kernel source uses
 * for the fan-in of its reductions and the size of its local scratch array.
 * NOTE(review): presumably kernel_cl passes these name/value pairs to the
 * kernel as compile-time definitions - confirm in kernel_cl.hpp.
 */
const kernel_cl<out_buffer, out_buffer, out_buffer, out_buffer, out_buffer,

                in_buffer, in_buffer, in_buffer, int, int, int, int, int, int>

    categorical_logit_glm("categorical_logit_glm",

                          {categorical_logit_glm_kernel_code},

                          {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}});


// \cond

static const std::string categorical_logit_glm_beta_derivative_kernel_code

    = STRINGIFY(

        // \endcond

        /** \ingroup opencl_kernels
         * Adds, for every class, the sum of x over the instances carrying
         * that class label to beta_derivative.
         *
         * The work group id is used as the attribute index, so each work
         * group handles one attribute. Every work item accumulates into its
         * own row of temp; after a barrier the rows of the work group are
         * summed per class and added to beta_derivative.
         *
         * @param[in,out] beta_derivative accumulated into (not overwritten),
         *   indexed as class * N_attributes + attribute
         * @param[in,out] temp scratch space, indexed as
         *   global_id * N_classes + class, so it needs N_classes elements
         *   per work item
         * @param[in] y class labels, 1-based; read at instance * is_y_vector
         * @param[in] x indexed as attribute * N_instances + instance
         * @param N_instances number of instances
         * @param N_attributes number of attributes
         * @param N_classes number of classes
         * @param is_y_vector 1 if y holds one label per instance, 0 if its
         *   first element applies to every instance
         */
        __kernel void categorical_logit_glm_beta_derivative(

            __global double* beta_derivative, __global double* temp,

            const __global int* y, const __global double* x,

            const int N_instances, const int N_attributes, const int N_classes,

            const int is_y_vector) {

          const int gid = get_global_id(0);

          const int lid = get_local_id(0);

          const int lsize = get_local_size(0);

          const int wg_id = get_group_id(0);


          // Clear this work item's row of the scratch buffer.
          for (int i = 0; i < N_classes; i++) {

            temp[gid * N_classes + i] = 0;

          }

          // Work items of a group share the instances with stride lsize.
          for (int i = lid; i < N_instances; i += lsize) {

            // NOTE(review): pos is not range-checked here; a label outside
            // 1..N_classes would index outside this work item's row.
            // Presumably labels are validated before this kernel is
            // launched - confirm against the caller.
            int pos = y[i * is_y_vector] - 1;

            temp[gid * N_classes + pos] += x[wg_id * N_instances + i];

          }

          // All rows of this work group must be complete before they are
          // read by other work items below.
          barrier(CLK_GLOBAL_MEM_FENCE);

          // Classes are shared between work items with stride lsize; each
          // one sums the rows written by the lsize work items of its group.
          for (int i = lid; i < N_classes; i += lsize) {

            double res = 0;

            for (int j = 0; j < lsize; j++) {

              res += temp[(wg_id * lsize + j) * N_classes + i];

            }

            beta_derivative[i * N_attributes + wg_id] += res;

          }

        }

        // \cond

    );  // NOLINT

// \endcond


/** \ingroup opencl_kernels
 * See the docs for categorical_logit_glm_beta_derivative().
 *
 * Host-side handle for the categorical_logit_glm_beta_derivative kernel.
 * The template arguments follow the kernel's parameter order: two buffers
 * that are both read and written (beta_derivative, temp), two input buffers
 * (y, x) and four integers.
 */
const kernel_cl<in_out_buffer, in_out_buffer, in_buffer, in_buffer, int, int,

                int, int>

    categorical_logit_glm_beta_derivative(

        "categorical_logit_glm_beta_derivative",

        {categorical_logit_glm_beta_derivative_kernel_code});


}  // namespace opencl_kernels


}  // namespace math

}  // namespace stan

#endif

#endif

stan::math::opencl_kernels::categorical_logit_glm
const kernel_cl< out_buffer, out_buffer, out_buffer, out_buffer, out_buffer, in_buffer, in_buffer, in_buffer, int, int, int, int, int, int > categorical_logit_glm("categorical_logit_glm", {categorical_logit_glm_kernel_code}, {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}})
See the docs for categorical_logit_glm().

stan::math::opencl_kernels::categorical_logit_glm_beta_derivative
const kernel_cl< in_out_buffer, in_out_buffer, in_buffer, in_buffer, int, int, int, int > categorical_logit_glm_beta_derivative("categorical_logit_glm_beta_derivative", {categorical_logit_glm_beta_derivative_kernel_code})
See the docs for categorical_logit_glm_beta_derivative().

kernel_cl.hpp

stan::math::step
T step(const T &y)
The step, or Heaviside, function.
Definition step.hpp:31

stan::math::log
fvar< T > log(const fvar< T > &x)
Definition log.hpp:18

stan::math::exp
fvar< T > exp(const fvar< T > &x)
Definition exp.hpp:15

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

STRINGIFY
#define STRINGIFY(...)
Definition stringify.hpp:9