heffte/heffte__reshape3d_8h_source.html

 /*

     -- heFFTe --

        Univ. of Tennessee, Knoxville

        @date

 */


 #ifndef HEFFTE_RESHAPE3D_H

 #define HEFFTE_RESHAPE3D_H


 #include "heffte_plan_logic.h"

 #include "heffte_backends.h"


 namespace heffte {


 // Expose the namespace of the enabled GPU backend under the common name heffte::gpu.
 // CUDA takes precedence: the ROCm and oneAPI checks sit inside the #else branch
 // and are only reached when Heffte_ENABLE_CUDA is not defined.
 #ifdef Heffte_ENABLE_CUDA

 namespace gpu { using namespace cuda; }

 #else

 #ifdef Heffte_ENABLE_ROCM

 namespace gpu { using namespace rocm; }

 #endif

 // NOTE(review): ROCm and oneAPI are tested independently of each other, so if both macros
 // were defined both using-directives would be active; presumably the build system
 // guarantees at most one GPU backend -- confirm.
 #ifdef Heffte_ENABLE_ONEAPI

 namespace gpu { using namespace oneapi; }

 #endif

 #endif


 /*!
  * \brief Generates an unpack plan where the boxes and the destination do not have the same order.
  *
  * Declaration only, the definition is not part of this header section.
  * In this file it is called by make_reshape3d() with me = 0, nprocs = 1 and a single input box,
  * and the resulting \b plans vector is checked for being empty before plans[0] is used.
  *
  * NOTE(review): \b proc, \b offset, \b sizes and \b plans are taken by non-const reference and
  * are presumably pure outputs filled by the call -- confirm against the definition.
  */
 template<typename index>

 void compute_overlap_map_transpose_pack(int me, int nprocs, box3d<index> const destination, std::vector<box3d<index>> const &boxes,

                                         std::vector<int> &proc, std::vector<int> &offset, std::vector<int> &sizes, std::vector<pack_plan_3d<index>> &plans);


 /*!
  * \brief Base reshape interface.
  *
  * Stores the number of entries of the input and the output and declares one pure virtual
  * apply() overload per supported scalar type (float, double and their std::complex variants).
  * Also owns two lazily resized CPU buffers used by the algorithms that do not rely on
  * GPU-Aware MPI.
  *
  * \tparam index is the integer type used for the sizes
  */
 template<typename index>
 class reshape3d_base{
 public:
     //! \brief Constructor that sets the input and output sizes.
     reshape3d_base(index cinput_size, index coutput_size) : input_size(cinput_size), output_size(coutput_size){}
     //! \brief Default virtual destructor.
     virtual ~reshape3d_base() = default;

     //! \brief Apply the reshape, single precision.
     virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const = 0;
     //! \brief Apply the reshape, double precision.
     virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const = 0;
     //! \brief Apply the reshape, single precision complex.
     virtual void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const = 0;
     //! \brief Apply the reshape, double precision complex.
     virtual void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const = 0;

     //! \brief Returns the input size (the misspelled name is kept for backward compatibility).
     index size_intput() const{ return input_size; }
     //! \brief Returns the input size, correctly spelled equivalent of size_intput().
     index size_input() const{ return input_size; }
     //! \brief Returns the output size.
     index size_output() const{ return output_size; }
     /*!
      * \brief Returns the workspace size, by default the sum of the input and output sizes.
      *
      * Each term is widened to size_t before the addition, adding in the \b index type first
      * would overflow when \b index is a 32-bit integer and the two sizes add up to more than
      * the maximum value of that type.
      */
     virtual size_t size_workspace() const{
         return static_cast<size_t>(input_size) + static_cast<size_t>(output_size);
     }

 protected:
     //! \brief Stores the size of the input.
     index const input_size;
     //! \brief Stores the size of the output.
     index const output_size;

     // Buffers used by the non-GPU-aware algorithms as temporary CPU storage.
     // The non-GPU-aware variant alleviates the latency when working with small FFTs,
     // hence the CPU buffers will be small and will not cause issues.
     // Note that the main API accepts a GPU buffer for scratch work, which cannot be used here.

     /*!
      * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
      *
      * The storage is a std::vector<float> resized to hold \b num_entries values of
      * \b scalar_type, the returned pointer aliases that vector and is invalidated
      * by the next call that resizes it.
      */
     template<typename scalar_type> scalar_type* cpu_send_buffer(size_t num_entries) const{
         // express the request in units of float, the element type of the backing vector
         size_t const float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
         send_unaware.resize(float_entries);
         return reinterpret_cast<scalar_type*>(send_unaware.data());
     }
     /*!
      * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
      *
      * Receive counterpart of cpu_send_buffer(), backed by a separate vector.
      */
     template<typename scalar_type> scalar_type* cpu_recv_buffer(size_t num_entries) const{
         size_t const float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
         recv_unaware.resize(float_entries);
         return reinterpret_cast<scalar_type*>(recv_unaware.data());
     }
     //! \brief Temp buffers for the gpu-unaware algorithms, mutable so const apply() methods can resize them.
     mutable std::vector<float> send_unaware;
     //! \brief Temp buffers for the gpu-unaware algorithms, mutable so const apply() methods can resize them.
     mutable std::vector<float> recv_unaware;
 };


 template<typename index>

 inline size_t get_workspace_size(std::array<std::unique_ptr<reshape3d_base<index>>, 4> const &shapers){

     size_t max_size = 0;

     for(auto const &s : shapers) if (s) max_size = std::max(max_size, s->size_workspace());

     return max_size;

 }


 /*!
  * \brief Reshape algorithm based on the MPI_Alltoall() method.
  *
  * Instances are created through the make_reshape3d_alltoall() factory, the constructor is private.
  * All four apply() overloads forward to the templated apply_base().
  *
  * \tparam location_tag selects the backend::device_instance base class
  * \tparam packer is the packer template, make_reshape3d() uses direct_packer or transpose_packer
  * \tparam index is the integer type used by the boxes and pack plans
  */
 template<typename location_tag, template<typename device> class packer, typename index>
 class reshape3d_alltoall : public reshape3d_base<index>, public backend::device_instance<location_tag>{
 public:
     //! \brief Destructor, frees the comm generated by the constructor.
     ~reshape3d_alltoall(){ mpi::comm_free(comm); }
     //! \brief Factory method, use to construct instances of the class.
     template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoall<b, p, i>>
     make_reshape3d_alltoall(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);

     //! \brief Apply the reshape operations, single precision overload.
     void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
         apply_base(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, double precision overload.
     void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
         apply_base(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, single precision complex overload.
     void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
         apply_base(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, double precision complex overload.
     void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
         apply_base(batch_size, source, destination, workspace);
     }

     //! \brief Templated reshape3d_alltoall::apply() algorithm for all scalar types, defined outside of this class body.
     template<typename scalar_type>
     void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;

     /*!
      * \brief The size of the workspace must include padding.
      *
      * Computed as twice num_entries for every pack plan. The product is carried out in size_t,
      * multiplying 2 * num_entries as int first would overflow once num_entries exceeds half
      * of the largest int.
      */
     size_t size_workspace() const override { return 2 * static_cast<size_t>(num_entries) * packplan.size(); }

 private:
     //! \brief Private constructor, defined out of line, use the factory method.
     reshape3d_alltoall(typename backend::device_instance<location_tag>::stream_type q,
                        int input_size, int output_size, bool gpu_aware, MPI_Comm ccomm,
                        std::vector<pack_plan_3d<index>>&&, std::vector<pack_plan_3d<index>>&&,
                        std::vector<int>&&, std::vector<int>&&, int);

     // The declaration order below fixes the member initialization order used by the
     // out-of-line constructor, do not reorder.
     MPI_Comm const comm; // released in the destructor
     int const me, nprocs;
     bool const use_gpu_aware;

     std::vector<pack_plan_3d<index>> packplan, unpackplan;
     std::vector<int> send_offset, recv_offset;
     int const num_entries; // multiplied by the number of pack plans in size_workspace()
 };


 /*!
  * \brief Factory method for reshape3d_alltoall, declaration only.
  *
  * The \b packer template defaults to direct_packer; make_reshape3d() in this file passes
  * either direct_packer or transpose_packer explicitly.
  * The function is a friend of reshape3d_alltoall, which gives it access to the private constructor.
  *
  * NOTE(review): presumably \b input_boxes and \b output_boxes hold one box per MPI rank of
  * \b comm -- confirm against the definition.
  */
 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>

 std::unique_ptr<reshape3d_alltoall<location_tag, packer, index>>

 make_reshape3d_alltoall(typename backend::device_instance<location_tag>::stream_type q,

                         std::vector<box3d<index>> const &input_boxes, std::vector<box3d<index>> const &output_boxes,

                         bool uses_gpu_aware, MPI_Comm const comm);


 /*!
  * \brief Reshape algorithm based on the MPI_Alltoallv() method.
  *
  * Instances are created through the make_reshape3d_alltoallv() factory, the constructor is private.
  * All four apply() overloads forward to the templated apply_base().
  * The workspace size is not overridden, so the reshape3d_base default applies.
  */
 template<typename location_tag, template<typename device> class packer, typename index>

 class reshape3d_alltoallv : public reshape3d_base<index>, public backend::device_instance<location_tag>{

 public:

     //! \brief Destructor, frees the comm generated by the constructor.
     ~reshape3d_alltoallv(){ mpi::comm_free(comm); }

     //! \brief Factory method, use to construct instances of the class.
     template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoallv<b, p, i>>

     make_reshape3d_alltoallv(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);


     //! \brief Apply the reshape operations, single precision overload.
     void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, double precision overload.
     void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, single precision complex overload.
     void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, double precision complex overload.
     void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }


     //! \brief Templated reshape3d_alltoallv::apply() algorithm for all scalar types, defined outside of this class body.
     template<typename scalar_type>

     void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;


 private:

     //! \brief Private constructor, defined out of line, use the factory method.
     reshape3d_alltoallv(typename backend::device_instance<location_tag>::stream_type q,

                         int input_size, int output_size,

                         bool gpu_aware, MPI_Comm new_comm, std::vector<int> const &pgroup,

                         std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> const &send_proc,

                         std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> const &recv_proc,

                         std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);


     // The declaration order below fixes the member initialization order used by the
     // out-of-line constructor, do not reorder.
     MPI_Comm const comm; // released in the destructor

     int const me, nprocs;

     bool const use_gpu_aware;


     std::vector<int> const send_offset;   // extraction loc for each send

     std::vector<int> const send_size;     // size of each send message

     std::vector<int> const recv_offset;   // insertion loc for each recv

     std::vector<int> const recv_size;     // size of each recv message

     int const send_total, recv_total;


     std::vector<pack_plan_3d<index>> const packplan, unpackplan;


     // Three parallel arrays with one entry per member of pgroup:
     // counts, displacements, and a map back into the proc/sizes lists.
     // NOTE(review): counts and displacements presumably feed MPI_Alltoallv() directly -- confirm in apply_base().
     struct iotripple{

         std::vector<int> counts, displacements, map;

         // Entries of pgroup that do not appear in proc keep count 0, displacement 0 and map -1.
         iotripple(std::vector<int> const &pgroup, std::vector<int> const &proc, std::vector<int> const &sizes) :

             counts(pgroup.size(), 0), displacements(pgroup.size(), 0), map(pgroup.size(), -1)

         {

             int offset = 0; // running sum of the sizes matched so far

             for(size_t src = 0; src < pgroup.size(); src++){

                 for(size_t i=0; i<proc.size(); i++){

                     if (proc[i] != pgroup[src]) continue;

                     counts[src] = sizes[i];

                     displacements[src] = offset;

                     offset += sizes[i];

                     map[src] = i; // position of this group member inside proc/sizes

                 }

             }

         }


     };


     iotripple const send, recv;

 };


 /*!
  * \brief Factory method for reshape3d_alltoallv, declaration only.
  *
  * The \b packer template defaults to direct_packer; make_reshape3d() in this file passes
  * either direct_packer or transpose_packer explicitly.
  * The function is a friend of reshape3d_alltoallv, which gives it access to the private constructor.
  *
  * NOTE(review): presumably \b input_boxes and \b output_boxes hold one box per MPI rank of
  * \b comm -- confirm against the definition.
  */
 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>

 std::unique_ptr<reshape3d_alltoallv<location_tag, packer, index>>

 make_reshape3d_alltoallv(typename backend::device_instance<location_tag>::stream_type q,

                          std::vector<box3d<index>> const &input_boxes,

                          std::vector<box3d<index>> const &output_boxes,

                          bool use_gpu_aware,

                          MPI_Comm const comm);


 /*!
  * \brief Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
  *
  * Instances are created through the make_reshape3d_pointtopoint() factory, the constructor is private.
  * All four apply() overloads forward to the templated apply_base().
  * The workspace size is not overridden, so the reshape3d_base default applies.
  */
 template<typename location_tag, template<typename device> class packer, typename index>

 class reshape3d_pointtopoint : public reshape3d_base<index>, public backend::device_instance<location_tag>{

 public:

     //! \brief Default destructor, unlike the all-to-all variants this class does not call mpi::comm_free().
     ~reshape3d_pointtopoint() = default;

     //! \brief Factory method, use to construct instances of the class.
     template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_pointtopoint<b, p, i>>

     make_reshape3d_pointtopoint(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, reshape_algorithm, bool, MPI_Comm const);


     //! \brief Apply the reshape operations, single precision overload.
     void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, double precision overload.
     void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, single precision complex overload.
     void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }

     //! \brief Apply the reshape operations, double precision complex overload.
     void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{

         apply_base(batch_size, source, destination, workspace);

     }


     //! \brief Templated reshape3d_pointtopoint::apply() algorithm for all scalar types, defined outside of this class body.
     template<typename scalar_type>

     void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;


     //! \brief Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI, defined outside of this class body.
     template<typename scalar_type>

     void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;


 private:

     //! \brief Private constructor, defined out of line, use the factory method.
     reshape3d_pointtopoint(typename backend::device_instance<location_tag>::stream_type stream,

                            int input_size, int output_size, reshape_algorithm alg, bool gpu_aware,  MPI_Comm ccomm,

                            std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> &&send_proc,

                            std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> &&recv_proc,

                            std::vector<int> &&recv_loc,

                            std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);


     // The declaration order below fixes the member initialization order used by the
     // out-of-line constructor, do not reorder.
     MPI_Comm const comm; // not freed by this class, the destructor is defaulted

     int const me, nprocs;

     bool const self_to_self;

     reshape_algorithm const algorithm;

     bool const use_gpu_aware;

     mutable std::vector<MPI_Request> requests; // recv_proc.size() requests, but remove one if using self_to_self communication

     mutable std::vector<MPI_Request> isends;   // mutable so that the const apply methods can update the requests


     std::vector<int> const send_proc;     // processor to send towards

     std::vector<int> const send_offset;   // extraction loc for each send

     std::vector<int> const send_size;     // size of each send message

     std::vector<int> const recv_proc;     // processor to receive from

     std::vector<int> const recv_offset;   // insertion loc for each recv

     std::vector<int> const recv_size;     // size of each recv message

     std::vector<int> const recv_loc;      // offset in the receive buffer (recv_offset refers to the destination buffer)

     int const send_total, recv_total;


     std::vector<pack_plan_3d<index>> const packplan, unpackplan;

     int max_send_size; // NOTE(review): the only non-const scalar member, presumably assigned in the constructor body -- confirm

 };


 /*!
  * \brief Factory method for reshape3d_pointtopoint, declaration only.
  *
  * The \b packer template defaults to direct_packer; make_reshape3d() in this file passes
  * either direct_packer or transpose_packer explicitly and forwards options.algorithm as \b algorithm.
  * The function is a friend of reshape3d_pointtopoint, which gives it access to the private constructor.
  *
  * NOTE(review): presumably \b input_boxes and \b output_boxes hold one box per MPI rank of
  * \b comm -- confirm against the definition.
  */
 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>

 std::unique_ptr<reshape3d_pointtopoint<location_tag, packer, index>>

 make_reshape3d_pointtopoint(typename backend::device_instance<location_tag>::stream_type q,

                             std::vector<box3d<index>> const &input_boxes,

                             std::vector<box3d<index>> const &output_boxes,

                             reshape_algorithm algorithm, bool use_gpu_aware,

                             MPI_Comm const comm);


 /*!
  * \brief Special case of the reshape that does not involve MPI communication but applies a transpose instead.
  *
  * The input and output sizes are both equal to the product of the three plan sizes.
  * The workspace is only touched when the transpose is performed in-place.
  */
 template<typename location_tag, typename index>
 class reshape3d_transpose : public reshape3d_base<index>, public backend::device_instance<location_tag>{
 public:
     //! \brief Constructor using the provided unpack plan.
     reshape3d_transpose(typename backend::device_instance<location_tag>::stream_type q,
                         pack_plan_3d<index> const cplan) :
         reshape3d_base<index>(entries_in(cplan), entries_in(cplan)),
         backend::device_instance<location_tag>(q),
         plan(cplan)
     {}

     //! \brief Apply the reshape operations, single precision overload.
     void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
         run_transpose(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, double precision overload.
     void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
         run_transpose(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, single precision complex overload.
     void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
         run_transpose(batch_size, source, destination, workspace);
     }
     //! \brief Apply the reshape operations, double precision complex overload.
     void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
         run_transpose(batch_size, source, destination, workspace);
     }

 private:
     //! \brief Number of entries covered by the plan, the product of the three sizes.
     static index entries_in(pack_plan_3d<index> const &p){
         return p.size[0] * p.size[1] * p.size[2];
     }

     //! \brief Shared implementation of all apply() overloads, transposes each batch entry in turn.
     template<typename scalar_type>
     void run_transpose(int batch_size, scalar_type const *source, scalar_type *destination, scalar_type *workspace) const{
         scalar_type const *origin = source;
         if (source == destination){
             // in-place request: stage the whole batch in the workspace and read from there
             backend::data_manipulator<location_tag>::copy_n(this->stream(), source, batch_size * this->input_size, workspace);
             origin = workspace;
         }
         for(int b = 0; b < batch_size; b++){
             auto const shift = b * this->input_size; // start of batch entry b
             transpose_packer<location_tag>().unpack(this->stream(), plan, origin + shift, destination + shift);
         }
     }

     //! \brief The plan handed to the transpose packer.
     pack_plan_3d<index> const plan;
 };


 /*!
  * \brief Factory method to create a reshape3d instance.
  *
  * - when the input and output boxes match and have the same order, no reshape is needed and
  *   an empty pointer is returned
  * - when the boxes match but the order differs, a local reshape3d_transpose is returned,
  *   or an empty pointer if no plan was generated for the box of this rank
  * - otherwise the communication algorithm is selected from \b options.algorithm, using the
  *   direct packer when the order is the same and the transpose packer when it is not
  *
  * \returns the reshape operation, or an empty pointer when there is nothing to do
  */
 template<typename backend_tag, typename index>
 std::unique_ptr<reshape3d_base<index>> make_reshape3d(typename backend::device_instance<typename backend::buffer_traits<backend_tag>::location>::stream_type stream,
                                                std::vector<box3d<index>> const &input_boxes,
                                                std::vector<box3d<index>> const &output_boxes,
                                                MPI_Comm const comm,
                                                plan_options const options){
     using location_tag = typename backend::buffer_traits<backend_tag>::location;
     using base_pointer = std::unique_ptr<reshape3d_base<index>>;

     bool const same_geometry = match(input_boxes, output_boxes);
     bool const same_order    = input_boxes[0].ordered_same_as(output_boxes[0]);

     if (same_geometry){
         if (same_order) return base_pointer(); // identical layouts, nothing to do

         // same boxes but different order, the data stays on this rank and is only transposed
         int const rank = mpi::comm_rank(comm);
         std::vector<int> proc, offset, sizes;
         std::vector<pack_plan_3d<index>> plans;

         compute_overlap_map_transpose_pack(0, 1, output_boxes[rank], {input_boxes[rank]}, proc, offset, sizes, plans);

         // when the number of indexes is very small, the current box can be empty
         if (plans.empty()) return base_pointer();

         return base_pointer(new reshape3d_transpose<location_tag, index >(stream, plans[0]));
     }

     switch(options.algorithm){
         case reshape_algorithm::alltoallv:
             if (same_order)
                 return make_reshape3d_alltoallv<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
                                                                                     options.use_gpu_aware, comm);
             return make_reshape3d_alltoallv<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
                                                                                    options.use_gpu_aware, comm);
         case reshape_algorithm::alltoall:
             if (same_order)
                 return make_reshape3d_alltoall<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
                                                                                    options.use_gpu_aware, comm);
             return make_reshape3d_alltoall<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
                                                                                   options.use_gpu_aware, comm);
         default: // every other algorithm is handled by the point-to-point reshape
             if (same_order)
                 return make_reshape3d_pointtopoint<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
                                                                                        options.algorithm, options.use_gpu_aware, comm);
             return make_reshape3d_pointtopoint<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
                                                                                       options.algorithm, options.use_gpu_aware, comm);
     }
 }


 }


 #endif

heffte::reshape3d_alltoall
Reshape algorithm based on the MPI_Alltoall() method.
Definition: heffte_reshape3d.h:133

heffte::reshape3d_alltoall::apply
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:154

heffte::reshape3d_alltoall::~reshape3d_alltoall
~reshape3d_alltoall()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:136

heffte::reshape3d_alltoall::apply
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:150

heffte::reshape3d_alltoall::size_workspace
size_t size_workspace() const override
The size of the workspace must include padding.
Definition: heffte_reshape3d.h:163

heffte::reshape3d_alltoall::apply
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:142

heffte::reshape3d_alltoall::apply_base
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoall::apply() algorithm for all scalar types.

heffte::reshape3d_alltoall::apply
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:146

heffte::reshape3d_alltoall::make_reshape3d_alltoall
friend std::unique_ptr< reshape3d_alltoall< b, p, i > > make_reshape3d_alltoall(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.

heffte::reshape3d_alltoallv
Reshape algorithm based on the MPI_Alltoallv() method.
Definition: heffte_reshape3d.h:226

heffte::reshape3d_alltoallv::make_reshape3d_alltoallv
friend std::unique_ptr< reshape3d_alltoallv< b, p, i > > make_reshape3d_alltoallv(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.

heffte::reshape3d_alltoallv::apply
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:247

heffte::reshape3d_alltoallv::apply
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:243

heffte::reshape3d_alltoallv::apply
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:239

heffte::reshape3d_alltoallv::~reshape3d_alltoallv
~reshape3d_alltoallv()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:229

heffte::reshape3d_alltoallv::apply_base
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoallv::apply() algorithm for all scalar types.

heffte::reshape3d_alltoallv::apply
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:235

heffte::reshape3d_base
Base reshape interface.
Definition: heffte_reshape3d.h:60

heffte::reshape3d_base::cpu_send_buffer
scalar_type * cpu_send_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:93

heffte::reshape3d_base::reshape3d_base
reshape3d_base(index cinput_size, index coutput_size)
Constructor that sets the input and output sizes.
Definition: heffte_reshape3d.h:63

heffte::reshape3d_base::cpu_recv_buffer
scalar_type * cpu_recv_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:99

heffte::reshape3d_base::output_size
index const output_size
Stores the size of the output.
Definition: heffte_reshape3d.h:86

heffte::reshape3d_base::apply
virtual void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const =0
Apply the reshape, double precision complex.

heffte::reshape3d_base::~reshape3d_base
virtual ~reshape3d_base()=default
Default virtual destructor.

heffte::reshape3d_base::size_intput
index size_intput() const
Returns the input size.
Definition: heffte_reshape3d.h:76

heffte::reshape3d_base::apply
virtual void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const =0
Apply the reshape, single precision complex.

heffte::reshape3d_base::apply
virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const =0
Apply the reshape, double precision.

heffte::reshape3d_base::input_size
index const input_size
Stores the size of the input.
Definition: heffte_reshape3d.h:84

heffte::reshape3d_base::size_output
index size_output() const
Returns the output size.
Definition: heffte_reshape3d.h:78

heffte::reshape3d_base::apply
virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const =0
Apply the reshape, single precision.

heffte::reshape3d_base::send_unaware
std::vector< float > send_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:105

heffte::reshape3d_base::size_workspace
virtual size_t size_workspace() const
Returns the workspace size.
Definition: heffte_reshape3d.h:80

heffte::reshape3d_base::recv_unaware
std::vector< float > recv_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:107

heffte::reshape3d_pointtopoint
Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
Definition: heffte_reshape3d.h:341

heffte::reshape3d_pointtopoint::~reshape3d_pointtopoint
~reshape3d_pointtopoint()=default
Default destructor; unlike the all-to-all reshapes, it does not free the comm.

heffte::reshape3d_pointtopoint::apply
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:350

heffte::reshape3d_pointtopoint::make_reshape3d_pointtopoint
friend std::unique_ptr< reshape3d_pointtopoint< b, p, i > > make_reshape3d_pointtopoint(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, reshape_algorithm, bool, MPI_Comm const)
Factory method, use to construct instances of the class.

heffte::reshape3d_pointtopoint::no_gpuaware_send_recv
void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI.

heffte::reshape3d_pointtopoint::apply_base
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm for all scalar types.

heffte::reshape3d_pointtopoint::apply
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:358

heffte::reshape3d_pointtopoint::apply
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:362

heffte::reshape3d_pointtopoint::apply
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:354

heffte::reshape3d_transpose
Special case of the reshape that does not involve MPI communication but applies a transpose instead.
Definition: heffte_reshape3d.h:444

heffte::reshape3d_transpose::reshape3d_transpose
reshape3d_transpose(typename backend::device_instance< location_tag >::stream_type q, pack_plan_3d< index > const cplan)
Constructor using the provided unpack plan.
Definition: heffte_reshape3d.h:447

heffte::reshape3d_transpose::apply
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:467

heffte::reshape3d_transpose::apply
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:455

heffte::reshape3d_transpose::apply
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:463

heffte::reshape3d_transpose::apply
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:459

heffte::reshape_algorithm
reshape_algorithm
Defines list of potential communication algorithms.
Definition: heffte_plan_logic.h:48

heffte::reshape_algorithm::alltoall
@ alltoall
Using the MPI_Alltoall options, with padding on the data.

heffte::reshape_algorithm::alltoallv
@ alltoallv
Using the MPI_Alltoallv options, no padding on the data (default option).

heffte::match
bool match(std::vector< box3d< index >> const &shape0, std::vector< box3d< index >> const &shape1)
Compares two vectors of boxes, returns true if all boxes match.
Definition: heffte_geometry.h:246

heffte::mpi::comm_rank
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition: heffte_utils.h:78

heffte::mpi::comm_free
void comm_free(MPI_Comm const comm)
Calls free on the MPI comm.
Definition: heffte_utils.h:174

heffte::compute_overlap_map_transpose_pack
void compute_overlap_map_transpose_pack(int me, int nprocs, box3d< index > const destination, std::vector< box3d< index >> const &boxes, std::vector< int > &proc, std::vector< int > &offset, std::vector< int > &sizes, std::vector< pack_plan_3d< index >> &plans)
Generates an unpack plan where the boxes and the destination do not have the same order.

heffte::make_reshape3d
std::unique_ptr< reshape3d_base< index > > make_reshape3d(typename backend::device_instance< typename backend::buffer_traits< backend_tag >::location >::stream_type stream, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, MPI_Comm const comm, plan_options const options)
Factory method to create a reshape3d instance.
Definition: heffte_reshape3d.h:505

heffte::make_reshape3d_alltoall
std::unique_ptr< reshape3d_alltoall< location_tag, packer, index > > make_reshape3d_alltoall(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool uses_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.

heffte::make_reshape3d_alltoallv
std::unique_ptr< reshape3d_alltoallv< location_tag, packer, index > > make_reshape3d_alltoallv(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.

heffte::make_reshape3d_pointtopoint
std::unique_ptr< reshape3d_pointtopoint< location_tag, packer, index > > make_reshape3d_pointtopoint(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, reshape_algorithm algorithm, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.

heffte::get_workspace_size
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index >>, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition: heffte_reshape3d.h:115

heffte
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38

heffte::backend::data_manipulator
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition: heffte_common.h:59

heffte::backend::device_instance
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358

heffte::backend::device_instance::stream_type
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition: heffte_common.h:370

heffte::box3d
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67

heffte::direct_packer
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83

heffte::pack_plan_3d
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32

heffte::plan_options
Defines a set of tweaks and options to use in the plan generation.
Definition: heffte_plan_logic.h:131

heffte::plan_options::algorithm
reshape_algorithm algorithm
Defines the communication algorithm.
Definition: heffte_plan_logic.h:148

heffte::plan_options::use_gpu_aware
bool use_gpu_aware
Defines whether to use MPI calls directly from the GPU or to move to the CPU first.
Definition: heffte_plan_logic.h:152

heffte::tag::cpu
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition: heffte_common.h:38

heffte::transpose_packer
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition: heffte_pack3d.h:116