opencl_context.hpp
#ifndef STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
#define STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
#ifdef STAN_OPENCL

#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
#ifndef OPENCL_DEVICE_ID
#error OPENCL_DEVICE_ID_NOT_SET
#endif
#ifndef OPENCL_PLATFORM_ID
#error OPENCL_PLATFORM_ID_NOT_SET
#endif

#include <stan/math/opencl/err/check_opencl.hpp>
#include <stan/math/opencl/matrix_cl_view.hpp>

#include <CL/opencl.hpp>
#include <tbb/concurrent_vector.h>
#include <string>
#include <sstream>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <vector>
#include <cmath>
#include <cerrno>

namespace stan {
namespace math {

/**
 * The opencl_context_base class represents an OpenCL context in the
 * standard Meyers singleton design pattern.
 */
class opencl_context_base {
  friend class opencl_context;

 private:
  /**
   * Construct the opencl_context by initializing the OpenCL context,
   * devices, and command queues.
   */
  opencl_context_base(int platform_id = OPENCL_PLATFORM_ID,
                      int device_id = OPENCL_DEVICE_ID) {
    try {
      // platform
      cl::Platform::get(&platforms_);
      if (platform_id >= platforms_.size()) {
        system_error("OpenCL Initialization", "[Platform]", -1,
                     "CL_INVALID_PLATFORM");
      }
      platform_.push_back(platforms_[platform_id]);
      platform_name_ = platform_[0].getInfo<CL_PLATFORM_NAME>();
      platform_[0].getDevices(DEVICE_FILTER, &devices_);
      if (devices_.size() == 0) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "CL_DEVICE_NOT_FOUND");
      }
      if (device_id >= devices_.size()) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "CL_INVALID_DEVICE");
      }
      device_.push_back(devices_[device_id]);
      // context and queue
      cl_command_queue_properties device_properties;
      device_[0].getInfo<cl_command_queue_properties>(
          CL_DEVICE_QUEUE_PROPERTIES, &device_properties);
      device_[0].getInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                 &max_thread_block_size_);
      std::vector<size_t> max_wg_sizes
          = device_[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
      if (max_wg_sizes.size() < 3) {
        system_error("OpenCL Initialization", "[Device]", -1,
                     "The device does not support 3D work groups!");
      }

      context_ = cl::Context(device_[0]);
      if (device_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
        command_queue_
            = cl::CommandQueue(context_, device_[0],
                               CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, nullptr);
        in_order_ = CL_FALSE;
      } else {
        command_queue_ = cl::CommandQueue(context_, device_[0], 0, nullptr);
        in_order_ = CL_TRUE;
      }
      int max_square_block_size
          = std::min({max_wg_sizes[0], max_wg_sizes[1],
                      static_cast<size_t>(std::sqrt(max_thread_block_size_))});

      // Run-time check of the maximum allowed dimension of a square thread
      // block. A work-group size of (32, 32) works on all recent GPUs but
      // would fail on some older integrated GPUs or CPUs.
      if (max_square_block_size < base_opts_["THREAD_BLOCK_SIZE"]) {
        base_opts_["THREAD_BLOCK_SIZE"] = max_square_block_size;
        base_opts_["WORK_PER_THREAD"] = 1;
      }
      if (std::min(max_thread_block_size_, max_wg_sizes[0])
          < base_opts_["LOCAL_SIZE_"]) {
        // must be a power of base_opts_["REDUCTION_STEP_SIZE"]
        const int p = std::log(max_thread_block_size_)
                      / std::log(base_opts_["REDUCTION_STEP_SIZE"]);
        base_opts_["LOCAL_SIZE_"]
            = std::pow(base_opts_["REDUCTION_STEP_SIZE"], p);
      }
      // Thread block size for the Cholesky
      // TODO(Steve): This should be tuned in a higher part of the stan language
      if (max_thread_block_size_ >= 256) {
        tuning_opts_.cholesky_min_L11_size = 256;
      } else {
        tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
      }
    } catch (const cl::Error& e) {
      check_opencl_error("opencl_context", e);
    }
  }
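  // Worked example of the LOCAL_SIZE_ adjustment above (illustrative only):
  // with max_thread_block_size_ = 1024 and REDUCTION_STEP_SIZE = 4,
  //   p = log(1024) / log(4) = 5          (truncated to int)
  //   LOCAL_SIZE_ = 4^5 = 1024,
  // i.e. LOCAL_SIZE_ becomes the largest power of REDUCTION_STEP_SIZE that
  // does not exceed the device's work-group limit.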

 protected:
  cl::Context context_;  // Manages the device, queue, platform, memory, etc.
  cl::CommandQueue command_queue_;  // job queue for device, one per device
  std::vector<cl::Platform> platforms_;  // Vector of available platforms
  std::vector<cl::Platform> platform_;  // The platform for compiling kernels
  std::string platform_name_;  // The platform, such as NVIDIA OpenCL or AMD SDK
  std::vector<cl::Device> device_;  // The selected OpenCL device
  std::vector<cl::Device> devices_;  // All available OpenCL devices
  std::string device_name_;  // The name of the OpenCL device
  size_t max_thread_block_size_;  // The maximum size of a block of workers on
                                  // the device
  bool in_order_;  // Whether the command queue executes in order (true when
                   // out-of-order execution is unavailable)
  // Holds default parameter values for each kernel.
  using map_base_opts = std::unordered_map<std::string, int>;
  map_base_opts base_opts_
      = {{"LOWER", static_cast<int>(matrix_cl_view::Lower)},
         {"UPPER", static_cast<int>(matrix_cl_view::Upper)},
         {"ENTIRE", static_cast<int>(matrix_cl_view::Entire)},
         {"DIAGONAL", static_cast<int>(matrix_cl_view::Diagonal)},
         {"THREAD_BLOCK_SIZE", 32},
         {"WORK_PER_THREAD", 8},
         {"REDUCTION_STEP_SIZE", 4},
         {"LOCAL_SIZE_", 4096}};
  // TODO(Steve): Make these tunable during warmup
  struct tuning_struct {
    // Used in math/opencl/cholesky_decompose
    int cholesky_min_L11_size = 256;
    int cholesky_partition = 4;
    int cholesky_size_worth_transfer = 1250;
    // Used in math/rev/fun/cholesky_decompose
    int cholesky_rev_min_block_size = 512;
    int cholesky_rev_block_partition = 8;
    // used in math/opencl/multiply
    int multiply_wgs_per_compute_unit = 5;
    // used in math/prim/fun/gp_exp_quad_cov
    double gp_exp_quad_cov_complex = 1'000'000;
    double gp_exp_quad_cov_simple = 100'000;
    // used in math/prim/fun/multiply
    // and math/rev/fun/multiply
    int multiply_dim_prod_worth_transfer = 2000000;
    // used in math/prim/fun/mdivide_left_tri
    // and math/rev/fun/mdivide_left_tri
    int tri_inverse_size_worth_transfer = 100;
  } tuning_opts_;

 protected:
  static opencl_context_base& getInstance() noexcept {
    static opencl_context_base instance_;
    return instance_;
  }

  static void select_device(int platform_id, int device_id) {
    getInstance() = opencl_context_base(platform_id, device_id);
  }
};
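
// Note on the pattern: getInstance() above is a Meyers singleton. A minimal
// sketch of the idiom (illustrative only, not part of this header):
//
//   class singleton {
//     singleton() = default;  // constructor is private
//
//    public:
//     static singleton& instance() {
//       static singleton s;  // constructed on first use; thread-safe in C++11
//       return s;
//     }
//   };
//
// The function-local static is initialized exactly once, so every caller in
// the process shares one OpenCL context.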

/**
 * The API to access the methods and values in opencl_context_base.
 */
class opencl_context {
  tbb::concurrent_vector<cl::Kernel*> kernel_caches_;

 public:
  opencl_context() = default;

  /**
   * Returns the description of the OpenCL platform and device that is used.
   */
  inline std::string description() const {
    std::ostringstream msg;

    msg << "Platform ID: " << OPENCL_PLATFORM_ID << "\n";
    msg << "Platform Name: "
        << opencl_context_base::getInstance()
               .platform_[0]
               .getInfo<CL_PLATFORM_NAME>()
        << "\n";
    msg << "Platform Vendor: "
        << opencl_context_base::getInstance()
               .platform_[0]
               .getInfo<CL_PLATFORM_VENDOR>()
        << "\n";
    msg << "\tDevice " << OPENCL_DEVICE_ID << ": "
        << "\n";
    msg << "\t\tDevice Name: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_NAME>()
        << "\n";
    msg << "\t\tDevice Type: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_TYPE>()
        << "\n";
    msg << "\t\tDevice Vendor: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_VENDOR>()
        << "\n";
    msg << "\t\tDevice Max Compute Units: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
        << "\n";
    msg << "\t\tDevice Global Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()
        << "\n";
    msg << "\t\tDevice Max Clock Frequency: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()
        << "\n";
    msg << "\t\tDevice Max Allocatable Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()
        << "\n";
    msg << "\t\tDevice Local Memory: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()
        << "\n";
    msg << "\t\tDevice Available: "
        << opencl_context_base::getInstance()
               .device_[0]
               .getInfo<CL_DEVICE_AVAILABLE>()
        << "\n";
    return msg.str();
  }
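  // Usage sketch (assuming the global stan::math::opencl_context instance
  // declared at the end of this file):
  //
  //   std::cout << stan::math::opencl_context.description();
  //
  // prints the platform and device the singleton context was built with.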

  /**
   * Returns the description of the OpenCL platforms and devices that are
   * available.
   */
  inline std::string capabilities() const {
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);
    std::ostringstream msg;
    int platform_id = 0;
    int device_id = 0;

    msg << "Number of Platforms: " << all_platforms.size() << "\n";
    for (auto plat_iter : all_platforms) {
      cl::Platform platform(plat_iter);

      msg << "Platform ID: " << platform_id++ << "\n";
      msg << "Platform Name: " << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
      msg << "Platform Vendor: " << platform.getInfo<CL_PLATFORM_VENDOR>()
          << "\n";

      try {
        std::vector<cl::Device> all_devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);

        for (auto device_iter : all_devices) {
          cl::Device device(device_iter);

          msg << "\tDevice " << device_id++ << ": "
              << "\n";
          msg << "\t\tDevice Name: " << device.getInfo<CL_DEVICE_NAME>()
              << "\n";
          msg << "\t\tDevice Type: " << device.getInfo<CL_DEVICE_TYPE>()
              << "\n";
          msg << "\t\tDevice Vendor: " << device.getInfo<CL_DEVICE_VENDOR>()
              << "\n";
          msg << "\t\tDevice Max Compute Units: "
              << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n";
          msg << "\t\tDevice Global Memory: "
              << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
          msg << "\t\tDevice Max Clock Frequency: "
              << device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << "\n";
          msg << "\t\tDevice Max Allocatable Memory: "
              << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << "\n";
          msg << "\t\tDevice Local Memory: "
              << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << "\n";
          msg << "\t\tDevice Available: "
              << device.getInfo<CL_DEVICE_AVAILABLE>() << "\n";
        }
      } catch (const cl::Error& e) {
        // If a platform has no devices of the requested type, getDevices
        // throws with err == -1 (CL_DEVICE_NOT_FOUND); any other error is
        // rethrown as a system error.
        if (e.err() == -1) {
          msg << "\tno (OpenCL) devices in the platform with ID "
              << (platform_id - 1) << "\n";
        } else {
          check_opencl_error("capabilities", e);
        }
      }
    }
    return msg.str();
  }

  /** Returns the reference to the OpenCL context. */
  inline cl::Context& context() noexcept {
    return opencl_context_base::getInstance().context_;
  }

  /** Returns the reference to the active OpenCL command queue for the device. */
  inline cl::CommandQueue& queue() noexcept {
    return opencl_context_base::getInstance().command_queue_;
  }

  /** Returns the map of default compile-time defines for the kernels. */
  inline opencl_context_base::map_base_opts& base_opts() noexcept {
    return opencl_context_base::getInstance().base_opts_;
  }

  /**
   * Returns the maximum thread block size defined by
   * CL_DEVICE_MAX_WORK_GROUP_SIZE for the device in the context.
   */
  inline int max_thread_block_size() noexcept {
    return opencl_context_base::getInstance().max_thread_block_size_;
  }

  /** Returns the struct of tuning options for the OpenCL kernels. */
  inline opencl_context_base::tuning_struct& tuning_opts() noexcept {
    return opencl_context_base::getInstance().tuning_opts_;
  }

  /** Returns a vector containing the OpenCL device used to create the context. */
  inline std::vector<cl::Device>& device() noexcept {
    return opencl_context_base::getInstance().device_;
  }

  /** Returns a vector containing the OpenCL platform used to create the context. */
  inline std::vector<cl::Platform>& platform() noexcept {
    return opencl_context_base::getInstance().platform_;
  }

  /** Returns a bool representing whether writes to the OpenCL device are blocking. */
  inline bool& in_order() noexcept {
    return opencl_context_base::getInstance().in_order_;
  }

  /**
   * Selects the OpenCL device to use from now on. All cached kernels are
   * invalidated so that they are rebuilt for the newly selected device.
   */
  inline void select_device(int platform_id, int instance_id) {
    for (cl::Kernel* cache : kernel_caches_) {
      *cache = cl::Kernel();
    }
    kernel_caches_.clear();
    opencl_context_base::select_device(platform_id, instance_id);
  }
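  // Usage sketch: switch to the second device on the first platform; every
  // kernel registered through register_kernel_cache() below is reset and will
  // be recompiled for the new device on its next use:
  //
  //   stan::math::opencl_context.select_device(0, 1);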

  /**
   * Registers a cached kernel. The cache is cleared whenever a different
   * device is selected.
   */
  inline void register_kernel_cache(cl::Kernel* cache) {
    kernel_caches_.push_back(cache);
  }
};

static opencl_context opencl_context;

}  // namespace math
}  // namespace stan

#endif  // STAN_OPENCL
#endif  // STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
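
A minimal usage sketch for this header (not part of the file above). It assumes a translation unit built with the usual stan-math OpenCL flags, e.g. -DSTAN_OPENCL -DOPENCL_PLATFORM_ID=0 -DOPENCL_DEVICE_ID=0, and linked against an OpenCL runtime; stan::math::opencl_context is the global instance declared at the end of the header.

#include <stan/math/opencl/opencl_context.hpp>
#include <iostream>

int main() {
  // List every OpenCL platform and device visible on this machine.
  std::cout << stan::math::opencl_context.capabilities();
  // Describe the platform and device the singleton context was built with.
  std::cout << stan::math::opencl_context.description();
  // Device limit consulted when sizing thread blocks.
  std::cout << "Max thread block size: "
            << stan::math::opencl_context.max_thread_block_size() << "\n";
}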