math/mpi__parallel__call_8hpp_source.html

#ifdef STAN_MPI


#ifndef STAN_MATH_PRIM_FUNCTOR_MPI_PARALLEL_CALL_HPP

#define STAN_MATH_PRIM_FUNCTOR_MPI_PARALLEL_CALL_HPP


#include <stan/math/prim/meta.hpp>

#include <stan/math/prim/functor/mpi_cluster.hpp>

#include <stan/math/prim/functor/mpi_distributed_apply.hpp>

#include <stan/math/prim/fun/to_array_1d.hpp>

#include <stan/math/prim/fun/dims.hpp>


#include <mutex>

#include <algorithm>

#include <vector>

#include <type_traits>

#include <functional>


namespace stan {

namespace math {


namespace internal {


template <int call_id, int member, typename T>

class mpi_parallel_call_cache {

  static T local_;

  static bool is_valid_;


 public:

  using cache_t = const T;


  mpi_parallel_call_cache() = delete;

  mpi_parallel_call_cache(const mpi_parallel_call_cache<call_id, member, T>&)

      = delete;

  mpi_parallel_call_cache& operator=(

      const mpi_parallel_call_cache<call_id, member, T>&)

      = delete;


  static bool is_valid() { return is_valid_; }


  static void store(const T& data) {

    if (is_valid_)

      throw std::runtime_error("Cache can only store a single data item.");

    local_ = data;

    is_valid_ = true;

  }


  static cache_t& data() {

    if (unlikely(!is_valid_))

      throw std::runtime_error("Cache not yet valid.");

    return local_;

  }

};


template <int call_id, int member, typename T>

T mpi_parallel_call_cache<call_id, member, T>::local_;


template <int call_id, int member, typename T>

bool mpi_parallel_call_cache<call_id, member, T>::is_valid_ = false;


}  // namespace internal


template <int call_id, typename ReduceF, typename CombineF>

class mpi_parallel_call {

  boost::mpi::communicator world_;

  const std::size_t rank_ = world_.rank();

  const std::size_t world_size_ = world_.size();

  std::unique_lock<std::mutex> cluster_lock_;


  using result_t = typename CombineF::result_t;


  // local caches which hold local slices of data

  using cache_x_r

      = internal::mpi_parallel_call_cache<call_id, 1,

                                          std::vector<std::vector<double>>>;

  using cache_x_i

      = internal::mpi_parallel_call_cache<call_id, 2,

                                          std::vector<std::vector<int>>>;

  using cache_f_out

      = internal::mpi_parallel_call_cache<call_id, 3, std::vector<int>>;

  using cache_chunks

      = internal::mpi_parallel_call_cache<call_id, 4, std::vector<int>>;


  // # of outputs for given call_id+ReduceF+CombineF case

  static int num_outputs_per_job_;


  CombineF combine_;


  vector_d local_shared_params_dbl_;

  matrix_d local_job_params_dbl_;


 public:

  template <typename T_shared_param, typename T_job_param>

  mpi_parallel_call(

      const T_shared_param& shared_params,

      const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&

          job_params,

      const std::vector<std::vector<double>>& x_r,

      const std::vector<std::vector<int>>& x_i)

      : combine_(shared_params, job_params) {

    if (rank_ != 0)

      throw std::runtime_error(

          "problem sizes may only be defined on the root.");


    check_matching_sizes("mpi_parallel_call", "job parameters", job_params,

                         "continuous data", x_r);

    check_matching_sizes("mpi_parallel_call", "job parameters", job_params,

                         "integer data", x_i);


    if (cache_chunks::is_valid()) {

      int cached_num_jobs = sum(cache_chunks::data());

      check_size_match("mpi_parallel_call", "cached number of jobs",

                       cached_num_jobs, "number of jobs", job_params.size());

    }


    // make children aware of upcoming job & obtain cluster lock

    cluster_lock_ = mpi_broadcast_command<stan::math::mpi_distributed_apply<

        mpi_parallel_call<call_id, ReduceF, CombineF>>>();


    const std::vector<int> job_dims = dims(job_params);


    const size_type num_jobs = job_dims[0];

    const size_type num_job_params = num_jobs == 0 ? 0 : job_dims[1];


    const vector_d shared_params_dbl = value_of(shared_params);

    matrix_d job_params_dbl(num_job_params, num_jobs);


    for (int j = 0; j < num_jobs; ++j)

      job_params_dbl.col(j) = value_of(job_params[j]);


    setup_call(shared_params_dbl, job_params_dbl, x_r, x_i);

  }


  // called on remote sites

  mpi_parallel_call() : combine_() {

    if (rank_ == 0)

      throw std::runtime_error("problem sizes must be defined on the root.");


    setup_call(vector_d(), matrix_d(), std::vector<std::vector<double>>(),

               std::vector<std::vector<int>>());

  }


  static void distributed_apply() {

    // call constructor for the remotes

    mpi_parallel_call<call_id, ReduceF, CombineF> job_chunk;


    job_chunk.reduce_combine();

  }


  result_t reduce_combine() {

    const std::vector<int>& job_chunks = cache_chunks::data();

    const int num_jobs = sum(job_chunks);


    const int first_job

        = std::accumulate(job_chunks.begin(), job_chunks.begin() + rank_, 0);

    const int num_local_jobs = local_job_params_dbl_.cols();

    int local_outputs_per_job = num_local_jobs == 0 ? 0 : num_outputs_per_job_;

    matrix_d local_output(

        local_outputs_per_job == -1 ? 0 : local_outputs_per_job,

        num_local_jobs);

    std::vector<int> local_f_out(num_local_jobs, -1);


    typename cache_x_r::cache_t& local_x_r = cache_x_r::data();

    typename cache_x_i::cache_t& local_x_i = cache_x_i::data();


    // check if we know already output sizes

    if (cache_f_out::is_valid()) {

      typename cache_f_out::cache_t& f_out = cache_f_out::data();

      const int num_outputs

          = std::accumulate(f_out.begin() + first_job,

                            f_out.begin() + first_job + num_local_jobs, 0);

      local_output.resize(Eigen::NoChange, num_outputs);

    }


    int local_ok = 1;

    try {

      for (int i = 0, offset = 0; i < num_local_jobs;

           offset += local_f_out[i], ++i) {

        const matrix_d job_output

            = ReduceF()(local_shared_params_dbl_, local_job_params_dbl_.col(i),

                        local_x_r[i], local_x_i[i], 0);

        local_f_out[i] = job_output.cols();


        if (local_outputs_per_job == -1) {

          local_outputs_per_job = job_output.rows();

          // changing rows, but leave column size as is

          local_output.conservativeResize(local_outputs_per_job,

                                          Eigen::NoChange);

        }


        if (local_output.cols() < offset + local_f_out[i])

          // leave row size, but change columns size

          local_output.conservativeResize(Eigen::NoChange,

                                          2 * (offset + local_f_out[i]));


        local_output.block(0, offset, local_output.rows(), local_f_out[i])

            = job_output;

      }

    } catch (const std::exception& e) {

      // see note 1 above for an explanation why we do not rethrow

      // here, but mereley flag it to keep the cluster synchronized

      local_ok = 0;

    }


    // during first execution we distribute the output sizes from

    // local jobs to the root. This needs to be done for the number of

    // outputs of each result and the number of outputs per job.

    if (num_outputs_per_job_ == -1) {

      // before we can cache the sizes locally we must ensure

      // that no exception has been fired from any node. Hence,

      // share the info about the current status across all nodes.

      int cluster_status = 0;

      boost::mpi::reduce(world_, local_ok, cluster_status, std::plus<int>(), 0);

      bool all_ok = cluster_status == static_cast<int>(world_size_);

      boost::mpi::broadcast(world_, all_ok, 0);

      if (!all_ok) {

        // err out on the root

        if (rank_ == 0) {

          throw std::domain_error("MPI error on first evaluation.");

        }

        // and ensure on the workers that they return into their

        // listening state

        return result_t();

      }


      // execution was ok everywhere such that we can now distribute the

      // outputs per job as recorded on the root on the first execution

      boost::mpi::broadcast(world_, local_outputs_per_job, 0);

      num_outputs_per_job_ = local_outputs_per_job;


      // the f_out cache may not yet be synchronized which we can do

      // now given that this first execution was just fine

      if (!cache_f_out::is_valid()) {

        std::vector<int> world_f_out(num_jobs, 0);

        boost::mpi::gatherv(world_, local_f_out.data(), num_local_jobs,

                            world_f_out.data(), job_chunks, 0);

        // on the root we now have all sizes from all children. Copy

        // over the local sizes to the world vector on each local node

        // in order to cache this information locally.

        std::copy(local_f_out.begin(), local_f_out.end(),

                  world_f_out.begin() + first_job);

        cache_f_out::store(world_f_out);

      }

    }


    typename cache_f_out::cache_t& world_f_out = cache_f_out::data();


    // check that cached sizes are the same as just collected from

    // this evaluation

    for (int i = 0; i < num_local_jobs; ++i) {

      if (world_f_out[first_job + i] != local_f_out[i]) {

        // in case of a mismatch we mark so, but flag the error only

        // on the root, see note 1 above.

        local_ok = 0;

        break;

      }

    }


    const std::size_t size_world_f_out = sum(world_f_out);

    matrix_d world_result(num_outputs_per_job_, size_world_f_out);


    std::vector<int> chunks_result(world_size_, 0);

    for (std::size_t i = 0, k = 0; i != world_size_; ++i)

      for (int j = 0; j != job_chunks[i]; ++j, ++k)

        chunks_result[i] += world_f_out[k] * num_outputs_per_job_;


    // collect results on root

    boost::mpi::gatherv(world_, local_output.data(), chunks_result[rank_],

                        world_result.data(), chunks_result, 0);


    // let root know if all went fine everywhere

    int cluster_status = 0;

    boost::mpi::reduce(world_, local_ok, cluster_status, std::plus<int>(), 0);

    bool all_ok = cluster_status == static_cast<int>(world_size_);


    // on the workers all is done now.

    if (rank_ != 0)

      return result_t();


    // in case something went wrong we throw on the root

    if (!all_ok)

      throw std::domain_error("Error during MPI evaluation.");


    return combine_(world_result, world_f_out);

  }


 private:

  template <typename T_cache>

  typename T_cache::cache_t& scatter_array_2d_cached(

      typename T_cache::cache_t& data) {

    // distribute data only if not in cache yet

    if (T_cache::is_valid()) {

      return T_cache::data();

    }


    // number of elements of each data item must be transferred to

    // the workers

    std::vector<int> data_dims = dims(data);

    data_dims.resize(2);


    boost::mpi::broadcast(world_, data_dims.data(), 2, 0);


    const std::vector<int> job_chunks = mpi_map_chunks(data_dims[0], 1);

    const std::vector<int> data_chunks

        = mpi_map_chunks(data_dims[0], data_dims[1]);


    auto flat_data = to_array_1d(data);

    decltype(flat_data) local_flat_data(data_chunks[rank_]);


    if (data_dims[0] * data_dims[1] > 0) {

      if (rank_ == 0) {

        boost::mpi::scatterv(world_, flat_data.data(), data_chunks,

                             local_flat_data.data(), 0);

      } else {

        boost::mpi::scatterv(world_, local_flat_data.data(), data_chunks[rank_],

                             0);

      }

    }


    std::vector<decltype(flat_data)> local_data;

    auto local_iter = local_flat_data.begin();

    for (int i = 0; i != job_chunks[rank_]; ++i) {

      typename T_cache::cache_t::value_type const data_elem(

          local_iter, local_iter + data_dims[1]);

      local_data.push_back(data_elem);

      local_iter += data_dims[1];

    }


    // finally we cache it locally

    T_cache::store(local_data);

    return T_cache::data();

  }


  template <typename T_cache>

  typename T_cache::cache_t& broadcast_array_1d_cached(

      typename T_cache::cache_t& data) {

    if (T_cache::is_valid()) {

      return T_cache::data();

    }


    std::size_t data_size = data.size();

    boost::mpi::broadcast(world_, data_size, 0);


    std::vector<typename T_cache::cache_t::value_type> local_data = data;

    local_data.resize(data_size);


    boost::mpi::broadcast(world_, local_data.data(), data_size, 0);

    T_cache::store(local_data);

    return T_cache::data();

  }


  template <int meta_cache_id>

  vector_d broadcast_vector(const vector_d& data) {

    using meta_cache

        = internal::mpi_parallel_call_cache<call_id, meta_cache_id,

                                            std::vector<size_type>>;

    const std::vector<size_type>& data_size

        = broadcast_array_1d_cached<meta_cache>({data.size()});


    vector_d local_data = data;

    local_data.resize(data_size[0]);


    boost::mpi::broadcast(world_, local_data.data(), data_size[0], 0);


    return local_data;

  }


  template <int meta_cache_id>

  matrix_d scatter_matrix(const matrix_d& data) {

    using meta_cache

        = internal::mpi_parallel_call_cache<call_id, meta_cache_id,

                                            std::vector<size_type>>;

    const std::vector<size_type>& dims

        = broadcast_array_1d_cached<meta_cache>({data.rows(), data.cols()});

    const size_type rows = dims[0];

    const size_type total_cols = dims[1];


    const std::vector<int> job_chunks = mpi_map_chunks(total_cols, 1);

    const std::vector<int> data_chunks = mpi_map_chunks(total_cols, rows);

    matrix_d local_data(rows, job_chunks[rank_]);

    if (rows * total_cols > 0) {

      if (rank_ == 0) {

        boost::mpi::scatterv(world_, data.data(), data_chunks,

                             local_data.data(), 0);

      } else {

        boost::mpi::scatterv(world_, local_data.data(), data_chunks[rank_], 0);

      }

    }


    return local_data;

  }


  void setup_call(const vector_d& shared_params, const matrix_d& job_params,

                  const std::vector<std::vector<double>>& x_r,

                  const std::vector<std::vector<int>>& x_i) {

    std::vector<int> job_chunks = mpi_map_chunks(job_params.cols(), 1);

    broadcast_array_1d_cached<cache_chunks>(job_chunks);


    local_shared_params_dbl_ = broadcast_vector<-1>(shared_params);

    local_job_params_dbl_ = scatter_matrix<-2>(job_params);


    // distribute const data if not yet cached

    scatter_array_2d_cached<cache_x_r>(x_r);

    scatter_array_2d_cached<cache_x_i>(x_i);

  }

};


template <int call_id, typename ReduceF, typename CombineF>

int mpi_parallel_call<call_id, ReduceF, CombineF>::num_outputs_per_job_ = -1;


}  // namespace math

}  // namespace stan


#endif


#endif

stan::math::internal::mpi_parallel_call_cache::local_
static T local_
Definition mpi_parallel_call.hpp:42

stan::math::internal::mpi_parallel_call_cache::mpi_parallel_call_cache
mpi_parallel_call_cache()=delete

stan::math::internal::mpi_parallel_call_cache::is_valid_
static bool is_valid_
Definition mpi_parallel_call.hpp:43

stan::math::internal::mpi_parallel_call_cache::operator=
mpi_parallel_call_cache & operator=(const mpi_parallel_call_cache< call_id, member, T > &)=delete

stan::math::internal::mpi_parallel_call_cache::mpi_parallel_call_cache
mpi_parallel_call_cache(const mpi_parallel_call_cache< call_id, member, T > &)=delete

stan::math::internal::mpi_parallel_call_cache::store
static void store(const T &data)
Store data to be cached locally.
Definition mpi_parallel_call.hpp:66

stan::math::internal::mpi_parallel_call_cache::is_valid
static bool is_valid()
Query if cache is in valid which it is once data has been stored.
Definition mpi_parallel_call.hpp:59

stan::math::internal::mpi_parallel_call_cache::cache_t
const T cache_t
Definition mpi_parallel_call.hpp:46

stan::math::internal::mpi_parallel_call_cache::data
static cache_t & data()
Obtain const reference to locally cached data if cache is valid (throws otherwise).
Definition mpi_parallel_call.hpp:78

stan::math::internal::mpi_parallel_call_cache
Container for locally cached data which is essentially implemented as singleton.
Definition mpi_parallel_call.hpp:41

stan::math::mpi_parallel_call::result_t
typename CombineF::result_t result_t
Definition mpi_parallel_call.hpp:167

stan::math::mpi_parallel_call::broadcast_vector
vector_d broadcast_vector(const vector_d &data)
Broadcasts an Eigen vector to the cluster.
Definition mpi_parallel_call.hpp:513

stan::math::mpi_parallel_call::world_size_
const std::size_t world_size_
Definition mpi_parallel_call.hpp:164

stan::math::mpi_parallel_call::scatter_matrix
matrix_d scatter_matrix(const matrix_d &data)
Scatters an Eigen matrix column wise over the cluster.
Definition mpi_parallel_call.hpp:541

stan::math::mpi_parallel_call::mpi_parallel_call
mpi_parallel_call(const T_shared_param &shared_params, const std::vector< Eigen::Matrix< T_job_param, Eigen::Dynamic, 1 > > &job_params, const std::vector< std::vector< double > > &x_r, const std::vector< std::vector< int > > &x_i)
Initiates a parallel MPI call on the root.
Definition mpi_parallel_call.hpp:204

stan::math::mpi_parallel_call::reduce_combine
result_t reduce_combine()
Once all data is distributed and cached the reduce_combine evaluates all assigned function evaluation...
Definition mpi_parallel_call.hpp:269

stan::math::mpi_parallel_call::setup_call
void setup_call(const vector_d &shared_params, const matrix_d &job_params, const std::vector< std::vector< double > > &x_r, const std::vector< std::vector< int > > &x_i)
Definition mpi_parallel_call.hpp:565

stan::math::mpi_parallel_call::local_job_params_dbl_
matrix_d local_job_params_dbl_
Definition mpi_parallel_call.hpp:187

stan::math::mpi_parallel_call::broadcast_array_1d_cached
T_cache::cache_t & broadcast_array_1d_cached(typename T_cache::cache_t &data)
Performs a cached broadcast of a 1D array (std::vector).
Definition mpi_parallel_call.hpp:483

stan::math::mpi_parallel_call::num_outputs_per_job_
static int num_outputs_per_job_
Definition mpi_parallel_call.hpp:182

stan::math::mpi_parallel_call::distributed_apply
static void distributed_apply()
Entry point on the workers for the mpi_parallel_call.
Definition mpi_parallel_call.hpp:256

stan::math::mpi_parallel_call::combine_
CombineF combine_
Definition mpi_parallel_call.hpp:184

stan::math::mpi_parallel_call::local_shared_params_dbl_
vector_d local_shared_params_dbl_
Definition mpi_parallel_call.hpp:186

stan::math::mpi_parallel_call::mpi_parallel_call
mpi_parallel_call()
Definition mpi_parallel_call.hpp:245

stan::math::mpi_parallel_call::rank_
const std::size_t rank_
Definition mpi_parallel_call.hpp:163

stan::math::mpi_parallel_call::world_
boost::mpi::communicator world_
Definition mpi_parallel_call.hpp:162

stan::math::mpi_parallel_call::cluster_lock_
std::unique_lock< std::mutex > cluster_lock_
Definition mpi_parallel_call.hpp:165

stan::math::mpi_parallel_call::scatter_array_2d_cached
T_cache::cache_t & scatter_array_2d_cached(typename T_cache::cache_t &data)
Performs a cached scatter of a 2D array (nested std::vector).
Definition mpi_parallel_call.hpp:423

stan::math::mpi_parallel_call
The MPI parallel call class manages the distributed evaluation of a collection of tasks following the...
Definition mpi_parallel_call.hpp:161

unlikely
#define unlikely(x)
Definition compiler_attributes.hpp:40

stan::math::rows
int64_t rows(const T_x &x)
Returns the number of rows in the specified kernel generator expression.
Definition rows.hpp:22

stan::math::to_array_1d
auto to_array_1d(T_x &&x)
Returns input matrix reshaped into a vector.
Definition to_array_1d.hpp:21

stan::math::dims
void dims(const T_x &x, std::vector< int > &result)
matrix_cl overload of the dims helper function in prim/fun/dims.hpp.
Definition dims.hpp:21

mpi_cluster.hpp

mpi_distributed_apply.hpp

stan::math::e
static constexpr double e()
Return the base of the natural logarithm.
Definition constants.hpp:20

stan::math::vector_d
Eigen::Matrix< double, Eigen::Dynamic, 1 > vector_d
Type for (column) vector of double values.
Definition typedefs.hpp:24

stan::math::value_of
T value_of(const fvar< T > &v)
Return the value of the specified variable.
Definition value_of.hpp:18

stan::math::check_matching_sizes
void check_matching_sizes(const char *function, const char *name1, const T_y1 &y1, const char *name2, const T_y2 &y2)
Check if two structures at the same size.
Definition check_matching_sizes.hpp:24

stan::math::mpi_map_chunks
std::vector< int > mpi_map_chunks(std::size_t num_jobs, std::size_t chunk_size=1)
Maps jobs of given chunk size to workers and returning a vector of counts.
Definition mpi_cluster.hpp:84

stan::math::size_type
Eigen::Matrix< double, Eigen::Dynamic, Eigen::Dynamic >::Index size_type
Type for sizes and indexes in an Eigen matrix with double elements.
Definition typedefs.hpp:11

stan::math::sum
auto sum(const std::vector< T > &m)
Return the sum of the entries of the specified standard vector.
Definition sum.hpp:23

stan::math::matrix_d
Eigen::Matrix< double, Eigen::Dynamic, Eigen::Dynamic > matrix_d
Type for matrix of double values.
Definition typedefs.hpp:19

stan::math::mpi_broadcast_command
std::unique_lock< std::mutex > mpi_broadcast_command()
Broadcasts default constructible commands to the cluster.
Definition mpi_cluster.hpp:238

stan::math::check_size_match
void check_size_match(const char *function, const char *name_i, T_size1 i, const char *name_j, T_size2 j)
Check if the provided sizes match.
Definition check_size_match.hpp:24

stan
The lgamma implementation in stan-math is based on either the reentrant safe lgamma_r implementation ...
Definition unit_vector_constrain.hpp:15

dims.hpp

to_array_1d.hpp

meta.hpp

stan::math::mpi_distributed_apply
MPI command template which calls the static method distributed_apply of the given class F.
Definition mpi_distributed_apply.hpp:27