7 #ifndef HEFFTE_RESHAPE3D_H
8 #define HEFFTE_RESHAPE3D_H
10 #include "heffte_plan_logic.h"
11 #include "heffte_backends.h"
31 #ifdef Heffte_ENABLE_CUDA
32 namespace gpu {
using namespace cuda; }
34 #ifdef Heffte_ENABLE_ROCM
35 namespace gpu {
using namespace rocm; }
37 #ifdef Heffte_ENABLE_ONEAPI
38 namespace gpu {
using namespace oneapi; }
51 template<
typename index>
53 std::vector<int> &proc, std::vector<int> &offset, std::vector<int> &sizes, std::vector<
pack_plan_3d<index>> &plans);
59 template<
typename index>
63 reshape3d_base(index cinput_size, index coutput_size) : input_size(cinput_size), output_size(coutput_size){};
67 virtual void apply(
int batch_size,
float const source[],
float destination[],
float workspace[])
const = 0;
69 virtual void apply(
int batch_size,
double const source[],
double destination[],
double workspace[])
const = 0;
71 virtual void apply(
int batch_size, std::complex<float>
const source[], std::complex<float> destination[], std::complex<float> workspace[])
const = 0;
73 virtual void apply(
int batch_size, std::complex<double>
const source[], std::complex<double> destination[], std::complex<double> workspace[])
const = 0;
93 template<
typename scalar_type> scalar_type*
cpu_send_buffer(
size_t num_entries)
const{
94 size_t float_entries = num_entries *
sizeof(scalar_type) /
sizeof(
float);
95 send_unaware.resize(float_entries);
96 return reinterpret_cast<scalar_type*
>(send_unaware.data());
99 template<
typename scalar_type> scalar_type*
cpu_recv_buffer(
size_t num_entries)
const{
100 size_t float_entries = num_entries *
sizeof(scalar_type) /
sizeof(
float);
101 recv_unaware.resize(float_entries);
102 return reinterpret_cast<scalar_type*
>(recv_unaware.data());
114 template<
typename index>
117 for(
auto const &s : shapers)
if (s) max_size = std::max(max_size, s->size_workspace());
132 template<
typename location_tag,
template<
typename device>
class packer,
typename index>
138 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_alltoall<b, p, i>>
142 void apply(
int batch_size,
float const source[],
float destination[],
float workspace[])
const override final{
143 apply_base(batch_size, source, destination, workspace);
146 void apply(
int batch_size,
double const source[],
double destination[],
double workspace[])
const override final{
147 apply_base(batch_size, source, destination, workspace);
150 void apply(
int batch_size, std::complex<float>
const source[], std::complex<float> destination[], std::complex<float> workspace[])
const override final{
151 apply_base(batch_size, source, destination, workspace);
154 void apply(
int batch_size, std::complex<double>
const source[], std::complex<double> destination[], std::complex<double> workspace[])
const override final{
155 apply_base(batch_size, source, destination, workspace);
159 template<
typename scalar_type>
160 void apply_base(
int batch_size, scalar_type
const source[], scalar_type destination[], scalar_type workspace[])
const;
163 size_t size_workspace()
const override {
return 2 * num_entries * packplan.size(); }
170 int input_size,
int output_size,
bool gpu_aware, MPI_Comm ccomm,
172 std::vector<int>&&, std::vector<int>&&,
int);
175 int const me, nprocs;
176 bool const use_gpu_aware;
178 std::vector<pack_plan_3d<index>> packplan, unpackplan;
179 std::vector<int> send_offset, recv_offset;
180 int const num_entries;
205 template<
typename location_tag,
template<
typename device>
class packer = direct_packer,
typename index>
206 std::unique_ptr<reshape3d_alltoall<location_tag, packer, index>>
209 bool uses_gpu_aware, MPI_Comm
const comm);
225 template<
typename location_tag,
template<
typename device>
class packer,
typename index>
231 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_alltoallv<b, p, i>>
235 void apply(
int batch_size,
float const source[],
float destination[],
float workspace[])
const override final{
236 apply_base(batch_size, source, destination, workspace);
239 void apply(
int batch_size,
double const source[],
double destination[],
double workspace[])
const override final{
240 apply_base(batch_size, source, destination, workspace);
243 void apply(
int batch_size, std::complex<float>
const source[], std::complex<float> destination[], std::complex<float> workspace[])
const override final{
244 apply_base(batch_size, source, destination, workspace);
247 void apply(
int batch_size, std::complex<double>
const source[], std::complex<double> destination[], std::complex<double> workspace[])
const override final{
248 apply_base(batch_size, source, destination, workspace);
252 template<
typename scalar_type>
253 void apply_base(
int batch_size, scalar_type
const source[], scalar_type destination[], scalar_type workspace[])
const;
260 int input_size,
int output_size,
261 bool gpu_aware, MPI_Comm new_comm, std::vector<int>
const &pgroup,
262 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int>
const &send_proc,
263 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int>
const &recv_proc,
267 int const me, nprocs;
268 bool const use_gpu_aware;
270 std::vector<int>
const send_offset;
271 std::vector<int>
const send_size;
272 std::vector<int>
const recv_offset;
273 std::vector<int>
const recv_size;
274 int const send_total, recv_total;
276 std::vector<pack_plan_3d<index>>
const packplan, unpackplan;
279 std::vector<int> counts, displacements, map;
280 iotripple(std::vector<int>
const &pgroup, std::vector<int>
const &proc, std::vector<int>
const &sizes) :
281 counts(pgroup.size(), 0), displacements(pgroup.size(), 0), map(pgroup.size(), -1)
284 for(
size_t src = 0; src < pgroup.size(); src++){
285 for(
size_t i=0; i<proc.size(); i++){
286 if (proc[i] != pgroup[src])
continue;
287 counts[src] = sizes[i];
288 displacements[src] = offset;
297 iotripple
const send, recv;
322 template<
typename location_tag,
template<
typename device>
class packer = direct_packer,
typename index>
323 std::unique_ptr<reshape3d_alltoallv<location_tag, packer, index>>
328 MPI_Comm
const comm);
340 template<
typename location_tag,
template<
typename device>
class packer,
typename index>
346 template<
typename b,
template<
typename d>
class p,
typename i>
friend std::unique_ptr<reshape3d_pointtopoint<b, p, i>>
350 void apply(
int batch_size,
float const source[],
float destination[],
float workspace[])
const override final{
351 apply_base(batch_size, source, destination, workspace);
354 void apply(
int batch_size,
double const source[],
double destination[],
double workspace[])
const override final{
355 apply_base(batch_size, source, destination, workspace);
358 void apply(
int batch_size, std::complex<float>
const source[], std::complex<float> destination[], std::complex<float> workspace[])
const override final{
359 apply_base(batch_size, source, destination, workspace);
362 void apply(
int batch_size, std::complex<double>
const source[], std::complex<double> destination[], std::complex<double> workspace[])
const override final{
363 apply_base(batch_size, source, destination, workspace);
367 template<
typename scalar_type>
368 void apply_base(
int batch_size, scalar_type
const source[], scalar_type destination[], scalar_type workspace[])
const;
371 template<
typename scalar_type>
372 void no_gpuaware_send_recv(
int batch_size, scalar_type
const source[], scalar_type destination[], scalar_type workspace[])
const;
379 int input_size,
int output_size,
reshape_algorithm alg,
bool gpu_aware, MPI_Comm ccomm,
380 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> &&send_proc,
381 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> &&recv_proc,
382 std::vector<int> &&recv_loc,
386 int const me, nprocs;
387 bool const self_to_self;
389 bool const use_gpu_aware;
390 mutable std::vector<MPI_Request> requests;
391 mutable std::vector<MPI_Request> isends;
393 std::vector<int>
const send_proc;
394 std::vector<int>
const send_offset;
395 std::vector<int>
const send_size;
396 std::vector<int>
const recv_proc;
397 std::vector<int>
const recv_offset;
398 std::vector<int>
const recv_size;
399 std::vector<int>
const recv_loc;
400 int const send_total, recv_total;
402 std::vector<pack_plan_3d<index>>
const packplan, unpackplan;
429 template<
typename location_tag,
template<
typename device>
class packer =
direct_packer,
typename index>
430 std::unique_ptr<reshape3d_pointtopoint<location_tag, packer, index>>
435 MPI_Comm
const comm);
443 template<
typename location_tag,
typename index>
449 reshape3d_base<index>(cplan.size[0] * cplan.size[1] * cplan.size[2], cplan.size[0] * cplan.size[1] * cplan.size[2]),
450 backend::device_instance<location_tag>(q),
455 void apply(
int batch_size,
float const source[],
float destination[],
float workspace[])
const override final{
456 transpose(batch_size, source, destination, workspace);
459 void apply(
int batch_size,
double const source[],
double destination[],
double workspace[])
const override final{
460 transpose(batch_size, source, destination, workspace);
463 void apply(
int batch_size, std::complex<float>
const source[], std::complex<float> destination[], std::complex<float> workspace[])
const override final{
464 transpose(batch_size, source, destination, workspace);
467 void apply(
int batch_size, std::complex<double>
const source[], std::complex<double> destination[], std::complex<double> workspace[])
const override final{
468 transpose(batch_size, source, destination, workspace);
472 template<
typename scalar_type>
473 void transpose(
int batch_size, scalar_type
const *source, scalar_type *destination, scalar_type *workspace)
const{
474 if (source == destination){
476 for(
int j=0; j<batch_size; j++)
478 destination + j * this->input_size);
480 for(
int j=0; j<batch_size; j++)
481 transpose_packer<location_tag>().unpack(this->stream(), plan, source + j * this->input_size,
482 destination + j * this->input_size);
486 pack_plan_3d<index>
const plan;
504 template<
typename backend_tag,
typename index>
512 if (
match(input_boxes, output_boxes)){
513 if (input_boxes[0].ordered_same_as(output_boxes[0])){
514 return std::unique_ptr<reshape3d_base<index>>();
517 std::vector<int> proc, offset, sizes;
518 std::vector<pack_plan_3d<index>> plans;
522 if (not plans.empty()){
526 return std::unique_ptr<reshape3d_base<index>>();
531 if (input_boxes[0].ordered_same_as(output_boxes[0])){
532 return make_reshape3d_alltoallv<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
535 return make_reshape3d_alltoallv<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
539 if (input_boxes[0].ordered_same_as(output_boxes[0])){
540 return make_reshape3d_alltoall<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
543 return make_reshape3d_alltoall<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
547 if (input_boxes[0].ordered_same_as(output_boxes[0])){
548 return make_reshape3d_pointtopoint<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
551 return make_reshape3d_pointtopoint<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
Reshape algorithm based on the MPI_Alltoall() method.
Definition: heffte_reshape3d.h:133
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:154
~reshape3d_alltoall()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:136
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:150
size_t size_workspace() const override
The size of the workspace must include padding.
Definition: heffte_reshape3d.h:163
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:142
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoall::apply() algorithm for all scalar types.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:146
friend std::unique_ptr< reshape3d_alltoall< b, p, i > > make_reshape3d_alltoall(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
Reshape algorithm based on the MPI_Alltoallv() method.
Definition: heffte_reshape3d.h:226
friend std::unique_ptr< reshape3d_alltoallv< b, p, i > > make_reshape3d_alltoallv(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:247
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:243
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:239
~reshape3d_alltoallv()
Destructor, frees the comm generated by the constructor.
Definition: heffte_reshape3d.h:229
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoallv::apply() algorithm for all scalar types.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:235
Base reshape interface.
Definition: heffte_reshape3d.h:60
scalar_type * cpu_send_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:93
reshape3d_base(index cinput_size, index coutput_size)
Constructor that sets the input and output sizes.
Definition: heffte_reshape3d.h:63
scalar_type * cpu_recv_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition: heffte_reshape3d.h:99
index const output_size
Stores the size of the output.
Definition: heffte_reshape3d.h:86
virtual void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const =0
Apply the reshape, double precision complex.
virtual ~reshape3d_base()=default
Default virtual destructor.
index size_intput() const
Returns the input size.
Definition: heffte_reshape3d.h:76
virtual void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const =0
Apply the reshape, single precision complex.
virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const =0
Apply the reshape, double precision.
index const input_size
Stores the size of the input.
Definition: heffte_reshape3d.h:84
index size_output() const
Returns the output size.
Definition: heffte_reshape3d.h:78
virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const =0
Apply the reshape, single precision.
std::vector< float > send_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:105
virtual size_t size_workspace() const
Returns the workspace size.
Definition: heffte_reshape3d.h:80
std::vector< float > recv_unaware
Temp buffers for the gpu-unaware algorithms.
Definition: heffte_reshape3d.h:107
Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
Definition: heffte_reshape3d.h:341
~reshape3d_pointtopoint()=default
Destructor, defaulted; no explicit cleanup of the comm is performed here.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:350
friend std::unique_ptr< reshape3d_pointtopoint< b, p, i > > make_reshape3d_pointtopoint(typename backend::device_instance< b >::stream_type, std::vector< box3d< i >> const &, std::vector< box3d< i >> const &, reshape_algorithm, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm for all scalar types.
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:358
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:362
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:354
Special case of the reshape that does not involve MPI communication but applies a transpose instead.
Definition: heffte_reshape3d.h:444
reshape3d_transpose(typename backend::device_instance< location_tag >::stream_type q, pack_plan_3d< index > const cplan)
Constructor using the provided unpack plan.
Definition: heffte_reshape3d.h:447
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition: heffte_reshape3d.h:467
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition: heffte_reshape3d.h:455
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition: heffte_reshape3d.h:463
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition: heffte_reshape3d.h:459
reshape_algorithm
Defines list of potential communication algorithms.
Definition: heffte_plan_logic.h:48
@ alltoall
Using the MPI_Alltoall options, with padding on the data.
@ alltoallv
Using the MPI_Alltoallv options, no padding on the data (default option).
bool match(std::vector< box3d< index >> const &shape0, std::vector< box3d< index >> const &shape1)
Compares two vectors of boxes, returns true if all boxes match.
Definition: heffte_geometry.h:246
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition: heffte_utils.h:78
void comm_free(MPI_Comm const comm)
Calls free on the MPI comm.
Definition: heffte_utils.h:174
void compute_overlap_map_transpose_pack(int me, int nprocs, box3d< index > const destination, std::vector< box3d< index >> const &boxes, std::vector< int > &proc, std::vector< int > &offset, std::vector< int > &sizes, std::vector< pack_plan_3d< index >> &plans)
Generates an unpack plan where the boxes and the destination do not have the same order.
std::unique_ptr< reshape3d_base< index > > make_reshape3d(typename backend::device_instance< typename backend::buffer_traits< backend_tag >::location >::stream_type stream, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, MPI_Comm const comm, plan_options const options)
Factory method to create a reshape3d instance.
Definition: heffte_reshape3d.h:505
std::unique_ptr< reshape3d_alltoall< location_tag, packer, index > > make_reshape3d_alltoall(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool uses_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoallv< location_tag, packer, index > > make_reshape3d_alltoallv(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_pointtopoint< location_tag, packer, index > > make_reshape3d_pointtopoint(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index >> const &input_boxes, std::vector< box3d< index >> const &output_boxes, reshape_algorithm algorithm, bool use_gpu_aware, MPI_Comm const comm)
Factory method that does all the necessary work to establish the communication patterns.
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index >>, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition: heffte_reshape3d.h:115
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition: heffte_common.h:59
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition: heffte_common.h:370
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
Defines a set of tweaks and options to use in the plan generation.
Definition: heffte_plan_logic.h:131
reshape_algorithm algorithm
Defines the communication algorithm.
Definition: heffte_plan_logic.h:148
bool use_gpu_aware
Defines whether to use MPI calls directly from the GPU or to move to the CPU first.
Definition: heffte_plan_logic.h:152
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition: heffte_common.h:38
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition: heffte_pack3d.h:116