math/inv__lower__tri__multiply_8hpp_source.html

#ifndef STAN_MATH_OPENCL_KERNELS_INVERSE_LOWER_TRI_MULTIPLY_HPP

#define STAN_MATH_OPENCL_KERNELS_INVERSE_LOWER_TRI_MULTIPLY_HPP

#ifdef STAN_OPENCL


#include <stan/math/opencl/kernel_cl.hpp>

#include <stan/math/opencl/buffer_types.hpp>

#include <string>


namespace stan {

namespace math {

namespace opencl_kernels {

// \cond

static constexpr const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(

    // \endcond

    __kernel void inv_lower_tri_multiply(__global double* A,

                                         __global double* temp,

                                         const int A_rows, const int rows) {

      int result_matrix_id = get_global_id(2);

      int offset = result_matrix_id * rows * 2;

      const int thread_block_row = get_local_id(0);

      const int thread_block_col = get_local_id(1);

      const int global_thread_row

          = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;

      const int global_thread_col

          = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;


      __local double C2_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];

      __local double A3_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];


      double acc[WORK_PER_THREAD] = {0};


      const int num_tiles = (rows + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;

      for (int tile_ind = 0; tile_ind < num_tiles; tile_ind++) {

        // Each thread copies WORK_PER_THREAD values to the local

        // memory

        for (int w = 0; w < WORK_PER_THREAD; w++) {

          const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;

          const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;

          // {C2}{A2}_global_{col}{row} specifies which global element for each

          // matrix the thread is in charge of moving to local memory.

          const int C2_global_col

              = offset + rows + tiled_j + w * THREAD_BLOCK_SIZE_COL;

          const int C2_global_row = offset + global_thread_row + rows;

          const int A3_global_col

              = offset + global_thread_col + w * THREAD_BLOCK_SIZE_COL;

          const int A3_global_row = tiled_i + rows + offset;

          // Which {col}{row} location in the local memory the thread is in

          //  charge of.

          const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;

          const int local_row = thread_block_row;

          // Element above the diagonal will not be transferred.

          if (C2_global_col <= C2_global_row && C2_global_col < A_rows

              && C2_global_row < A_rows) {

            C2_local[local_col][local_row]

                = A[C2_global_col * A_rows + C2_global_row];

          } else {

            C2_local[local_col][local_row] = 0;

          }

          if (A3_global_col < A_rows && A3_global_row < A_rows) {

            A3_local[local_col][local_row]

                = A[A3_global_col * A_rows + A3_global_row];

          } else {

            A3_local[local_col][local_row] = 0.0;

          }

        }

        // Wait until all tile values are loaded to the local memory

        barrier(CLK_LOCAL_MEM_FENCE);

        for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {

          for (int w = 0; w < WORK_PER_THREAD; w++) {

            const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;

            const int local_row = thread_block_row;

            acc[w] += C2_local[block_ind][local_row]

                      * A3_local[local_col][block_ind];

          }

        }

        barrier(CLK_LOCAL_MEM_FENCE);

      }

      // Global offset for each resulting submatrix

      const int batch_offset = result_matrix_id * rows * rows;

      // temp_global_{row}{col} tells the thread which local memory it needs

      //  to move to the final output

      const int temp_global_row = global_thread_row;

      // save the values

      for (int w = 0; w < WORK_PER_THREAD; w++) {

        // each thread saves WORK_PER_THREAD values

        const int temp_global_col

            = global_thread_col + w * THREAD_BLOCK_SIZE_COL;

        temp[batch_offset + temp_global_col * rows + temp_global_row] = acc[w];

      }

    }

    // \cond

);

// \endcond


const kernel_cl<in_buffer, out_buffer, int, int> inv_lower_tri_multiply(

    "inv_lower_tri_multiply",

    {thread_block_helpers, inv_lower_tri_multiply_kernel_code},

    {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}});


}  // namespace opencl_kernels

}  // namespace math

}  // namespace stan

#endif

#endif

buffer_types.hpp

stan::math::opencl_kernels::inv_lower_tri_multiply
const kernel_cl< in_buffer, out_buffer, int, int > inv_lower_tri_multiply("inv_lower_tri_multiply", {thread_block_helpers, inv_lower_tri_multiply_kernel_code}, {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}})
See the docs for add() .

stan::math::rows
int64_t rows(const T_x &x)
Returns the number of rows in the specified kernel generator expression.
Definition rows.hpp:22

kernel_cl.hpp

stan::math::opencl_kernels::thread_block_helpers
static const std::string thread_block_helpers
Defines a helper macro for kernels with 2D local size.
Definition helpers.hpp:24

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

STRINGIFY
#define STRINGIFY(...)
Definition stringify.hpp:9

stan::math::opencl_kernels::kernel_cl
Creates functor for kernels.
Definition kernel_cl.hpp:174