Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_reshape3d.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_RESHAPE3D_H
8 #define HEFFTE_RESHAPE3D_H
9 
10 #include "heffte_plan_logic.h"
11 #include "heffte_backends.h"
12 
29 namespace heffte {
30 
// Selects which backend namespace is pulled into heffte::gpu.
// When Heffte_ENABLE_CUDA is defined only the cuda namespace is used; otherwise
// the rocm and oneapi namespaces are each pulled in if their macro is defined.
// If none of the three macros is defined, no gpu namespace is declared here.
31 #ifdef Heffte_ENABLE_CUDA
32 namespace gpu { using namespace cuda; }
33 #else
34 #ifdef Heffte_ENABLE_ROCM
35 namespace gpu { using namespace rocm; }
36 #endif
37 #ifdef Heffte_ENABLE_ONEAPI
38 namespace gpu { using namespace oneapi; }
39 #endif
40 #endif
41 
51 template<typename index>
52 void compute_overlap_map_transpose_pack(int me, int nprocs, box3d<index> const destination, std::vector<box3d<index>> const &boxes,
53  std::vector<int> &proc, std::vector<int> &offset, std::vector<int> &sizes, std::vector<pack_plan_3d<index>> &plans);
54 
/*!
 * \brief Base reshape interface.
 *
 * Stores the input and output sizes and declares one pure-virtual apply() overload
 * for each of the four supported scalar types. It also holds two CPU-side scratch
 * vectors that derived classes can obtain through cpu_send_buffer() and cpu_recv_buffer().
 */
template<typename index>
class reshape3d_base{
public:
    //! \brief Constructor that sets the input and output sizes.
    reshape3d_base(index cinput_size, index coutput_size) : input_size(cinput_size), output_size(coutput_size){}
    //! \brief Default virtual destructor.
    virtual ~reshape3d_base() = default;
    //! \brief Apply the reshape, single precision.
    virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const = 0;
    //! \brief Apply the reshape, double precision.
    virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const = 0;
    //! \brief Apply the reshape, single precision complex.
    virtual void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const = 0;
    //! \brief Apply the reshape, double precision complex.
    virtual void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const = 0;

    //! \brief Returns the input size (the spelling of the name is part of the existing interface).
    index size_intput() const{ return input_size; }
    //! \brief Returns the output size.
    index size_output() const{ return output_size; }
    //! \brief Returns the workspace size, the sum of the input and output sizes.
    virtual size_t size_workspace() const{
        // widen before adding so the sum cannot overflow a narrow index type
        return static_cast<size_t>(input_size) + static_cast<size_t>(output_size);
    }

protected:
    //! \brief Stores the size of the input.
    index const input_size;
    //! \brief Stores the size of the output.
    index const output_size;

    // buffers to be used in the no-gpu-aware algorithm for the temporary cpu storage
    // the no-gpu-aware version alleviates the latency when working with small FFTs
    // hence the cpu buffers will be small and will not cause issues
    // note that the main API accepts a GPU buffer for scratch work and cannot be used here

    /*!
     * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
     *
     * Resizes the send vector to hold \b num_entries values of \b scalar_type,
     * counted in units of float, and returns the storage viewed as \b scalar_type.
     */
    template<typename scalar_type> scalar_type* cpu_send_buffer(size_t num_entries) const{
        size_t float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
        send_unaware.resize(float_entries);
        return reinterpret_cast<scalar_type*>(send_unaware.data());
    }
    /*!
     * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
     *
     * Same as cpu_send_buffer() but operates on the receive vector.
     */
    template<typename scalar_type> scalar_type* cpu_recv_buffer(size_t num_entries) const{
        size_t float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
        recv_unaware.resize(float_entries);
        return reinterpret_cast<scalar_type*>(recv_unaware.data());
    }
    //! \brief Temp buffers for the gpu-unaware algorithms (mutable so the const buffer methods can resize it).
    mutable std::vector<float> send_unaware;
    //! \brief Temp buffers for the gpu-unaware algorithms (mutable so the const buffer methods can resize it).
    mutable std::vector<float> recv_unaware;
};
109 
114 template<typename index>
115 inline size_t get_workspace_size(std::array<std::unique_ptr<reshape3d_base<index>>, 4> const &shapers){
116  size_t max_size = 0;
117  for(auto const &s : shapers) if (s) max_size = std::max(max_size, s->size_workspace());
118  return max_size;
119 }
120 
132 template<typename location_tag, template<typename device> class packer, typename index>
133 class reshape3d_alltoall : public reshape3d_base<index>, public backend::device_instance<location_tag>{
134 public:
138  template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoall<b, p, i>>
139  make_reshape3d_alltoall(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);
140 
142  void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
143  apply_base(batch_size, source, destination, workspace);
144  }
146  void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
147  apply_base(batch_size, source, destination, workspace);
148  }
150  void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
151  apply_base(batch_size, source, destination, workspace);
152  }
154  void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
155  apply_base(batch_size, source, destination, workspace);
156  }
157 
159  template<typename scalar_type>
160  void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
161 
163  size_t size_workspace() const override { return 2 * num_entries * packplan.size(); }
164 
165 private:
170  int input_size, int output_size, bool gpu_aware, MPI_Comm ccomm,
171  std::vector<pack_plan_3d<index>>&&, std::vector<pack_plan_3d<index>>&&,
172  std::vector<int>&&, std::vector<int>&&, int);
173 
174  MPI_Comm const comm;
175  int const me, nprocs;
176  bool const use_gpu_aware;
177 
178  std::vector<pack_plan_3d<index>> packplan, unpackplan;
179  std::vector<int> send_offset, recv_offset;
180  int const num_entries;
181 };
182 
205 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
206 std::unique_ptr<reshape3d_alltoall<location_tag, packer, index>>
208  std::vector<box3d<index>> const &input_boxes, std::vector<box3d<index>> const &output_boxes,
209  bool uses_gpu_aware, MPI_Comm const comm);
210 
225 template<typename location_tag, template<typename device> class packer, typename index>
226 class reshape3d_alltoallv : public reshape3d_base<index>, public backend::device_instance<location_tag>{
227 public:
231  template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoallv<b, p, i>>
232  make_reshape3d_alltoallv(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);
233 
235  void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
236  apply_base(batch_size, source, destination, workspace);
237  }
239  void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
240  apply_base(batch_size, source, destination, workspace);
241  }
243  void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
244  apply_base(batch_size, source, destination, workspace);
245  }
247  void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
248  apply_base(batch_size, source, destination, workspace);
249  }
250 
252  template<typename scalar_type>
253  void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
254 
255 private:
260  int input_size, int output_size,
261  bool gpu_aware, MPI_Comm new_comm, std::vector<int> const &pgroup,
262  std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> const &send_proc,
263  std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> const &recv_proc,
264  std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);
265 
266  MPI_Comm const comm;
267  int const me, nprocs;
268  bool const use_gpu_aware;
269 
270  std::vector<int> const send_offset; // extraction loc for each send
271  std::vector<int> const send_size; // size of each send message
272  std::vector<int> const recv_offset; // insertion loc for each recv
273  std::vector<int> const recv_size; // size of each recv message
274  int const send_total, recv_total;
275 
276  std::vector<pack_plan_3d<index>> const packplan, unpackplan;
277 
278  struct iotripple{
279  std::vector<int> counts, displacements, map;
280  iotripple(std::vector<int> const &pgroup, std::vector<int> const &proc, std::vector<int> const &sizes) :
281  counts(pgroup.size(), 0), displacements(pgroup.size(), 0), map(pgroup.size(), -1)
282  {
283  int offset = 0;
284  for(size_t src = 0; src < pgroup.size(); src++){
285  for(size_t i=0; i<proc.size(); i++){
286  if (proc[i] != pgroup[src]) continue;
287  counts[src] = sizes[i];
288  displacements[src] = offset;
289  offset += sizes[i];
290  map[src] = i;
291  }
292  }
293  }
294 
295  };
296 
297  iotripple const send, recv;
298 };
299 
322 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
323 std::unique_ptr<reshape3d_alltoallv<location_tag, packer, index>>
325  std::vector<box3d<index>> const &input_boxes,
326  std::vector<box3d<index>> const &output_boxes,
327  bool use_gpu_aware,
328  MPI_Comm const comm);
329 
340 template<typename location_tag, template<typename device> class packer, typename index>
341 class reshape3d_pointtopoint : public reshape3d_base<index>, public backend::device_instance<location_tag>{
342 public:
346  template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_pointtopoint<b, p, i>>
347  make_reshape3d_pointtopoint(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, reshape_algorithm, bool, MPI_Comm const);
348 
350  void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
351  apply_base(batch_size, source, destination, workspace);
352  }
354  void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
355  apply_base(batch_size, source, destination, workspace);
356  }
358  void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
359  apply_base(batch_size, source, destination, workspace);
360  }
362  void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
363  apply_base(batch_size, source, destination, workspace);
364  }
365 
367  template<typename scalar_type>
368  void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
369 
371  template<typename scalar_type>
372  void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
373 
374 private:
379  int input_size, int output_size, reshape_algorithm alg, bool gpu_aware, MPI_Comm ccomm,
380  std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> &&send_proc,
381  std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> &&recv_proc,
382  std::vector<int> &&recv_loc,
383  std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);
384 
385  MPI_Comm const comm;
386  int const me, nprocs;
387  bool const self_to_self;
388  reshape_algorithm const algorithm;
389  bool const use_gpu_aware;
390  mutable std::vector<MPI_Request> requests; // recv_proc.size() requests, but remove one if using self_to_self communication
391  mutable std::vector<MPI_Request> isends;
392 
393  std::vector<int> const send_proc; // processor to send towards
394  std::vector<int> const send_offset; // extraction loc for each send
395  std::vector<int> const send_size; // size of each send message
396  std::vector<int> const recv_proc; // processor to receive from
397  std::vector<int> const recv_offset; // insertion loc for each recv
398  std::vector<int> const recv_size; // size of each recv message
399  std::vector<int> const recv_loc; // offset in the receive buffer (recv_offset refers to the the destination buffer)
400  int const send_total, recv_total;
401 
402  std::vector<pack_plan_3d<index>> const packplan, unpackplan;
403  int max_send_size;
404 };
405 
429 template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
430 std::unique_ptr<reshape3d_pointtopoint<location_tag, packer, index>>
432  std::vector<box3d<index>> const &input_boxes,
433  std::vector<box3d<index>> const &output_boxes,
434  reshape_algorithm algorithm, bool use_gpu_aware,
435  MPI_Comm const comm);
436 
443 template<typename location_tag, typename index>
444 class reshape3d_transpose : public reshape3d_base<index>, public backend::device_instance<location_tag>{
445 public:
448  pack_plan_3d<index> const cplan) :
449  reshape3d_base<index>(cplan.size[0] * cplan.size[1] * cplan.size[2], cplan.size[0] * cplan.size[1] * cplan.size[2]),
450  backend::device_instance<location_tag>(q),
451  plan(cplan)
452  {}
453 
455  void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
456  transpose(batch_size, source, destination, workspace);
457  }
459  void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
460  transpose(batch_size, source, destination, workspace);
461  }
463  void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
464  transpose(batch_size, source, destination, workspace);
465  }
467  void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
468  transpose(batch_size, source, destination, workspace);
469  }
470 
471 private:
472  template<typename scalar_type>
473  void transpose(int batch_size, scalar_type const *source, scalar_type *destination, scalar_type *workspace) const{
474  if (source == destination){ // in-place transpose will need workspace
475  backend::data_manipulator<location_tag>::copy_n(this->stream(), source, batch_size * this->input_size, workspace);
476  for(int j=0; j<batch_size; j++)
477  transpose_packer<location_tag>().unpack(this->stream(), plan, workspace + j * this->input_size,
478  destination + j * this->input_size);
479  }else{
480  for(int j=0; j<batch_size; j++)
481  transpose_packer<location_tag>().unpack(this->stream(), plan, source + j * this->input_size,
482  destination + j * this->input_size);
483  }
484  }
485 
486  pack_plan_3d<index> const plan;
487 };
488 
504 template<typename backend_tag, typename index>
505 std::unique_ptr<reshape3d_base<index>> make_reshape3d(typename backend::device_instance<typename backend::buffer_traits<backend_tag>::location>::stream_type stream,
506  std::vector<box3d<index>> const &input_boxes,
507  std::vector<box3d<index>> const &output_boxes,
508  MPI_Comm const comm,
509  plan_options const options){
510  using location_tag = typename backend::buffer_traits<backend_tag>::location;
511 
512  if (match(input_boxes, output_boxes)){
513  if (input_boxes[0].ordered_same_as(output_boxes[0])){
514  return std::unique_ptr<reshape3d_base<index>>();
515  }else{
516  int const me = mpi::comm_rank(comm);
517  std::vector<int> proc, offset, sizes;
518  std::vector<pack_plan_3d<index>> plans;
519 
520  compute_overlap_map_transpose_pack(0, 1, output_boxes[me], {input_boxes[me]}, proc, offset, sizes, plans);
521 
522  if (not plans.empty()){
523  return std::unique_ptr<reshape3d_base<index>>(new reshape3d_transpose<location_tag, index >(stream, plans[0]));
524  }else{
525  // when the number of indexes is very small, the current box can be empty
526  return std::unique_ptr<reshape3d_base<index>>();
527  }
528  }
529  }else{
530  if (options.algorithm == reshape_algorithm::alltoallv){
531  if (input_boxes[0].ordered_same_as(output_boxes[0])){
532  return make_reshape3d_alltoallv<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
533  options.use_gpu_aware, comm);
534  }else{
535  return make_reshape3d_alltoallv<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
536  options.use_gpu_aware, comm);
537  }
538  }else if (options.algorithm == reshape_algorithm::alltoall){
539  if (input_boxes[0].ordered_same_as(output_boxes[0])){
540  return make_reshape3d_alltoall<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
541  options.use_gpu_aware, comm);
542  }else{
543  return make_reshape3d_alltoall<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
544  options.use_gpu_aware, comm);
545  }
546  }else{
547  if (input_boxes[0].ordered_same_as(output_boxes[0])){
548  return make_reshape3d_pointtopoint<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
549  options.algorithm, options.use_gpu_aware, comm);
550  }else{
551  return make_reshape3d_pointtopoint<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
552  options.algorithm, options.use_gpu_aware, comm);
553  }
554  }
555  }
556 }
557 
558 }
559 
560 #endif
Reshape algorithm based on the MPI_Alltoall() method.
Definition: heffte_reshape3d.h:133
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:154
~reshape3d_alltoall()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:136
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:150
size_t size_workspace() const override
The size of the workspace must include padding.
Definition: heffte_reshape3d.h:163
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:142
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoall::apply() algorithm for all scalar types.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:146
friend std::unique_ptr< reshape3d_alltoall< b, p, i > > make_reshape3d_alltoall(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
Reshape algorithm based on the MPI_Alltoallv() method.
Definition: heffte_reshape3d.h:226
friend std::unique_ptr< reshape3d_alltoallv< b, p, i > > make_reshape3d_alltoallv(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:247
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:243
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:239
~reshape3d_alltoallv()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:229
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoallv::apply() algorithm for all scalar types.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:235
Base reshape interface.
Definition: heffte_reshape3d.h:60
scalar_type * cpu_send_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:93
reshape3d_base(index cinput_size, index coutput_size)
Constructor that sets the input and output sizes.
Definition: heffte_reshape3d.h:63
scalar_type * cpu_recv_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:99
index const output_size
Stores the size of the output.
Definition: heffte_reshape3d.h:86
virtual void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const =0
Apply the reshape, double precision complex.
virtual ~reshape3d_base()=default
Default virtual destructor.
index size_intput() const
Returns the input size.
Definition: heffte_reshape3d.h:76
virtual void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const =0
Apply the reshape, single precision complex.
virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const =0
Apply the reshape, double precision.
index const input_size
Stores the size of the input.
Definition: heffte_reshape3d.h:84
index size_output() const
Returns the output size.
Definition: heffte_reshape3d.h:78
virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const =0
Apply the reshape, single precision.
std::vector< float > send_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:105
virtual size_t size_workspace() const
Returns the workspace size.
Definition: heffte_reshape3d.h:80
std::vector< float > recv_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:107
Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
Definition: heffte_reshape3d.h:341
~reshape3d_pointtopoint()=default
Default destructor; it does not free any communicator.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:350
friend std::unique_ptr< reshape3d_pointtopoint< b, p, i > > make_reshape3d_pointtopoint(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, reshape_algorithm, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm for all scalar types.
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:358
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:362
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:354
Special case of the reshape that does not involve MPI communication but applies a transpose instead.
Definition: heffte_reshape3d.h:444
reshape3d_transpose(typename backend::device_instance< location_tag >::stream_type q, pack_plan_3d< index > const cplan)
Constructor using the provided unpack plan.
Definition: heffte_reshape3d.h:447
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:467
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:455
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:463
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:459
reshape_algorithm
Defines list of potential communication algorithms.
Definition: heffte_plan_logic.h:48
@ alltoall
Using the MPI_Alltoall options, with padding on the data.
@ alltoallv
Using the MPI_Alltoallv options, no padding on the data (default option).
bool match(std::vector< box3d< index >> const &shape0, std::vector< box3d< index >> const &shape1)
Compares two vectors of boxes, returns true if all boxes match.
Definition: heffte_geometry.h:246
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition: heffte_utils.h:78
void comm_free(MPI_Comm const comm)
Calls free on the MPI comm.
Definition: heffte_utils.h:174
void compute_overlap_map_transpose_pack(int me, int nprocs, box3d< index > const destination, std::vector< box3d< index >> const &boxes, std::vector< int > &proc, std::vector< int > &offset, std::vector< int > &sizes, std::vector< pack_plan_3d< index >> &plans)
Generates an unpack plan where the boxes and the destination do not have the same order.
std::unique_ptr< reshape3d_base< index > > make_reshape3d(typename backend::device_instance< typename backend::buffer_traits< backend_tag >::location >::stream_type stream, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, MPI_Comm const comm, plan_options const options)
Factory method to create a reshape3d instance.
Definition: heffte_reshape3d.h:505
std::unique_ptr< reshape3d_alltoall< location_tag, packer, index > > make_reshape3d_alltoall(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool uses_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoallv< location_tag, packer, index > > make_reshape3d_alltoallv(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_pointtopoint< location_tag, packer, index > > make_reshape3d_pointtopoint(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, reshape_algorithm algorithm, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index >>, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition: heffte_reshape3d.h:115
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition: heffte_common.h:59
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition: heffte_common.h:370
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
Defines the direct packer without implementation, use the specializations to get the CPU or GPU implementation.
Definition: heffte_pack3d.h:83
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
Defines a set of tweaks and options to use in the plan generation.
Definition: heffte_plan_logic.h:131
reshape_algorithm algorithm
Defines the communication algorithm.
Definition: heffte_plan_logic.h:148
bool use_gpu_aware
Defines whether to use MPI calls directly from the GPU or to move to the CPU first.
Definition: heffte_plan_logic.h:152
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition: heffte_common.h:38
Defines the transpose packer without implementation, use the specializations to get the CPU implementation.
Definition: heffte_pack3d.h:116