Automatic Differentiation
 
reduce_sum.hpp
#ifndef STAN_MATH_REV_FUNCTOR_REDUCE_SUM_HPP
#define STAN_MATH_REV_FUNCTOR_REDUCE_SUM_HPP

// (Stan Math internal includes elided from the original listing)

#include <tbb/task_arena.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>

#include <tuple>
#include <memory>
#include <utility>
#include <vector>

namespace stan {
namespace math {
namespace internal {

/**
 * Specialization of reduce_sum_impl for reverse-mode autodiff (var)
 * return types.
 */
template <typename ReduceFunction, typename ReturnType, typename Vec,
          typename... Args>
struct reduce_sum_impl<ReduceFunction, require_var_t<ReturnType>, ReturnType,
                       Vec, Args...> {
  struct scoped_args_tuple {
    ScopedChainableStack stack_;
    using args_tuple_t
        = std::tuple<decltype(deep_copy_vars(std::declval<Args>()))...>;
    std::unique_ptr<args_tuple_t> args_tuple_holder_;

    scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {}
  };
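
  // Illustrative note (not in the original source): scoped_args_tuple
  // pairs a private tape (stack_) with deep copies of the shared
  // arguments, so each reducer can zero and accumulate adjoints on its
  // own copies without touching the main thread's autodiff stack.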

  /**
   * Reducer body for tbb::parallel_reduce: each reducer accumulates the
   * value and the adjoints of an independent partial sum over a
   * sub-range of the input terms.
   */
  struct recursive_reducer {
    const size_t num_vars_per_term_;
    const size_t num_vars_shared_terms_;  // Number of vars in shared arguments
    double* sliced_partials_;  // Points to adjoints of the partial calculations
    std::decay_t<Vec> vmapped_;  // Local copy of the sliced argument
    std::stringstream msgs_;
    std::tuple<Args...> args_tuple_;
    scoped_args_tuple local_args_tuple_scope_;
    double sum_{0.0};
    Eigen::VectorXd args_adjoints_{0};

    template <typename VecT, typename... ArgsT>
    recursive_reducer(size_t num_vars_per_term, size_t num_vars_shared_terms,
                      double* sliced_partials, VecT&& vmapped, ArgsT&&... args)
        : num_vars_per_term_(num_vars_per_term),
          num_vars_shared_terms_(num_vars_shared_terms),
          sliced_partials_(sliced_partials),
          vmapped_(std::forward<VecT>(vmapped)),
          args_tuple_(std::forward<ArgsT>(args)...) {}

    /*
     * This is the splitting constructor required by the imperative form
     * of tbb::parallel_reduce. It requires that sum_ and args_adjoints_
     * start at zero, since the newly created reducer is used to
     * accumulate an independent partial sum.
     */
    recursive_reducer(recursive_reducer& other, tbb::split)
        : num_vars_per_term_(other.num_vars_per_term_),
          num_vars_shared_terms_(other.num_vars_shared_terms_),
          sliced_partials_(other.sliced_partials_),
          vmapped_(other.vmapped_),
          args_tuple_(other.args_tuple_) {}

    /**
     * Compute, using nested autodiff, the value and Jacobian of
     * ReduceFunction called over the range defined by r, and accumulate
     * the results into sum_ and args_adjoints_.
     */
    inline void operator()(const tbb::blocked_range<size_t>& r) {
      if (r.empty()) {
        return;
      }

      if (args_adjoints_.size() == 0) {
        args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_);
      }

      // Obtain a reference to a local copy of all shared arguments that
      // does not point back to the main autodiff stack.

      if (!local_args_tuple_scope_.args_tuple_holder_) {
        // Shared arguments need to be copied to reducer-specific
        // scope. In this case there is no need to zero adjoints, since
        // the fresh copy has all adjoints set to zero.
        local_args_tuple_scope_.stack_.execute([&]() {
          math::apply(
              [&](auto&&... args) {
                local_args_tuple_scope_.args_tuple_holder_
                    = std::make_unique<typename scoped_args_tuple::args_tuple_t>(
                        deep_copy_vars(args)...);
              },
              args_tuple_);
        });
      } else {
        // Set adjoints of shared arguments to zero
        local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); });
      }

      auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_);

      // Initialize nested autodiff stack
      const nested_rev_autodiff begin_nest;

      // Create nested autodiff copies of the sliced argument that do not
      // point back to the main autodiff stack
      std::decay_t<Vec> local_sub_slice;
      local_sub_slice.reserve(r.size());
      for (size_t i = r.begin(); i < r.end(); ++i) {
        local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i]));
      }

      // Perform the calculation; ReduceFunction receives an inclusive
      // end index, hence r.end() - 1
      var sub_sum_v = math::apply(
          [&](auto&&... args) {
            return ReduceFunction()(local_sub_slice, r.begin(), r.end() - 1,
                                    &msgs_, args...);
          },
          args_tuple_local);

      // Compute Jacobian
      sub_sum_v.grad();

      // Accumulate value of reduce_sum
      sum_ += sub_sum_v.val();

      // Accumulate adjoints of sliced arguments
      accumulate_adjoints(sliced_partials_ + r.begin() * num_vars_per_term_,
                          std::move(local_sub_slice));

      // Accumulate adjoints of shared arguments
      math::apply(
          [&](auto&&... args) {
            accumulate_adjoints(args_adjoints_.data(), args...);
          },
          args_tuple_local);
    }

    /**
     * Join the partial sum and shared-argument adjoints accumulated by
     * another reducer into this one.
     */
    inline void join(const recursive_reducer& rhs) {
      sum_ += rhs.sum_;
      if (args_adjoints_.size() != 0 && rhs.args_adjoints_.size() != 0) {
        args_adjoints_ += rhs.args_adjoints_;
      } else if (args_adjoints_.size() == 0 && rhs.args_adjoints_.size() != 0) {
        args_adjoints_ = rhs.args_adjoints_;
      }
      msgs_ << rhs.msgs_.str();
    }
  };
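
  // Illustrative note (not in the original source): tbb::parallel_reduce
  // may split `worker` into several reducers via the splitting
  // constructor above, run each over a disjoint sub-range on its own
  // thread, and then recombine the partial results pairwise through
  // join(). sum_ and args_adjoints_ therefore hold only partial results
  // until every join has completed.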

  /**
   * Call an instance of ReduceFunction on every element of the input
   * sequence and sum the resulting terms, evaluating the partial sums
   * in parallel.
   */
  inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize,
                        std::ostream* msgs, Args&&... args) const {
    if (vmapped.empty()) {
      return var(0.0);
    }

    const std::size_t num_terms = vmapped.size();
    const std::size_t num_vars_per_term = count_vars(vmapped[0]);
    const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term;
    const std::size_t num_vars_shared_terms = count_vars(args...);

    vari** varis = ChainableStack::instance_->memalloc_.alloc_array<vari*>(
        num_vars_sliced_terms + num_vars_shared_terms);
    double* partials = ChainableStack::instance_->memalloc_.alloc_array<double>(
        num_vars_sliced_terms + num_vars_shared_terms);

    save_varis(varis, vmapped);
    save_varis(varis + num_vars_sliced_terms, args...);

    for (size_t i = 0; i < num_vars_sliced_terms; ++i) {
      partials[i] = 0.0;
    }

    recursive_reducer worker(num_vars_per_term, num_vars_shared_terms, partials,
                             std::forward<Vec>(vmapped),
                             std::forward<Args>(args)...);

    // We must use task isolation as described here:
    // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html
    // This ensures that the thread-local AD tape resource is not
    // modified from a different task, which may happen whenever this
    // function is itself used in a parallel context (like running
    // multiple chains for Stan).
    tbb::this_task_arena::isolate([&] {
      if (auto_partitioning) {
        tbb::parallel_reduce(
            tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker);
      } else {
        tbb::simple_partitioner partitioner;
        tbb::parallel_deterministic_reduce(
            tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker,
            partitioner);
      }
    });

    for (size_t i = 0; i < num_vars_shared_terms; ++i) {
      partials[num_vars_sliced_terms + i] = worker.args_adjoints_.coeff(i);
    }

    if (msgs) {
      *msgs << worker.msgs_.str();
    }

    return var(new precomputed_gradients_vari(
        worker.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis,
        partials));
  }
};
}  // namespace internal

}  // namespace math
}  // namespace stan

#endif
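
For context, here is a minimal sketch of how this specialization is reached through the public stan::math::reduce_sum API. It is not part of the original header: the sum_of_squares functor is hypothetical, and the only assumptions are the documented calling convention above (a sub-slice, an inclusive start/end index pair, a message stream pointer, then any shared arguments) and that reduce_sum takes the auto_partitioning branch while reduce_sum_static takes the deterministic one.

    #include <stan/math.hpp>
    #include <iostream>
    #include <vector>

    // Hypothetical reducer: returns the partial sum of x*x over the
    // sub-slice it is handed. start and end are the inclusive indices of
    // the slice within the full input; they are unused here.
    struct sum_of_squares {
      template <typename T, typename... Args>
      auto operator()(const std::vector<T>& x_slice, std::size_t /* start */,
                      std::size_t /* end */, std::ostream* /* msgs */,
                      Args&&... /* shared args */) const {
        T sum = 0.0;
        for (const auto& x : x_slice) {
          sum += x * x;
        }
        return sum;
      }
    };

    int main() {
      std::vector<stan::math::var> x(1024, 0.5);
      // grainsize is a partitioning hint; it must be positive.
      stan::math::var total
          = stan::math::reduce_sum<sum_of_squares>(x, 1, nullptr);
      total.grad();  // each x[i] now has adjoint 2 * 0.5 = 1.0
      std::cout << "value = " << total.val() << std::endl;
      stan::math::recover_memory();
      return 0;
    }

Because the return type is var, this call lands in the require_var_t specialization shown above: the partial sums run under nested autodiff on worker threads, and only the single precomputed_gradients_vari node is pushed onto the caller's tape.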