Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_fft3d_r2c.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_FFT3D_R2C_H
8 #define HEFFTE_FFT3D_R2C_H
9 
10 #include "heffte_fft3d.h"
11 
12 namespace heffte {
13 
45 template<typename backend_tag, typename index = int>
46 class fft3d_r2c : public backend::device_instance<typename backend::buffer_traits<backend_tag>::location>{
47 public:
56 
60  template<typename T> using buffer_container = typename backend::buffer_traits<backend_tag>::template container<T>;
65 
76  fft3d_r2c(box3d<index> const inbox, box3d<index> const outbox, int r2c_direction, MPI_Comm const comm,
77  plan_options const options = default_options<backend_tag>()) :
78  fft3d_r2c(plan_operations(mpi::gather_boxes(inbox, outbox, comm), r2c_direction, set_options<backend_tag, true>(options), mpi::comm_rank(comm)), comm){
79  assert(r2c_direction == 0 or r2c_direction == 1 or r2c_direction == 2);
80  static_assert(backend::is_enabled<backend_tag>::value, "The requested backend is invalid or has not been enabled.");
81  }
86  box3d<index> const inbox, box3d<index> const outbox, int r2c_direction, MPI_Comm const comm,
87  plan_options const options = default_options<backend_tag>()) :
88  fft3d_r2c(gpu_stream,
89  plan_operations(mpi::gather_boxes(inbox, outbox, comm), r2c_direction, set_options<backend_tag, true>(options), mpi::comm_rank(comm)),
90  comm){
91  assert(r2c_direction == 0 or r2c_direction == 1 or r2c_direction == 2);
92  static_assert(backend::is_enabled<backend_tag>::value, "The requested backend is invalid or has not been enabled.");
93  }
94 
96  fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2, int io0, int io1, int io2,
97  int ol0, int ol1, int ol2, int oh0, int oh1, int oh2, int oo0, int oo1, int oo2,
98  int r2c_direction, MPI_Comm const comm,
99  bool use_reorder, int algorithm, bool use_pencils)
100  : fft3d_r2c(box3d<index>({il0, il1, il2}, {ih0, ih1, ih2}, {io0, io1, io2}),
101  box3d<index>({ol0, ol1, ol2}, {oh0, oh1, oh2}, {oo0, oo1, oo2}),
102  r2c_direction, comm,
103  plan_options(use_reorder, static_cast<reshape_algorithm>(algorithm), use_pencils))
104  {}
106  fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2, int io0, int io1, int io2,
107  int ol0, int ol1, int ol2, int oh0, int oh1, int oh2, int oo0, int oo1, int oo2,
108  int r2c_direction, MPI_Comm const comm)
109  : fft3d_r2c(box3d<index>({il0, il1, il2}, {ih0, ih1, ih2}, {io0, io1, io2}),
110  box3d<index>({ol0, ol1, ol2}, {oh0, oh1, oh2}, {oo0, oo1, oo2}),
111  r2c_direction, comm)
112  {}
114  fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2,
115  int ol0, int ol1, int ol2, int oh0, int oh1, int oh2,
116  int r2c_direction, MPI_Comm const comm)
117  : fft3d_r2c(box3d<index>({il0, il1, il2}, {ih0, ih1, ih2}), box3d<index>({ol0, ol1, ol2}, {oh0, oh1, oh2}), r2c_direction, comm)
118  {}
119 
121  long long size_inbox() const{ return pinbox->count(); }
123  long long size_outbox() const{ return poutbox->count(); }
125  box3d<index> inbox() const{ return *pinbox; }
127  box3d<index> outbox() const{ return *poutbox; }
129  size_t size_workspace() const{ return size_buffer_work; }
131  size_t size_comm_buffers() const{ return comm_buffer_offset; }
132 
146  template<typename input_type, typename output_type>
147  void forward(input_type const input[], output_type output[], scale scaling = scale::none) const{
148  static_assert((std::is_same<input_type, float>::value and is_ccomplex<output_type>::value)
149  or (std::is_same<input_type, double>::value and is_zcomplex<output_type>::value),
150  "Using either an unknown complex type or an incompatible pair of types!");
151 
152  auto workspace = make_buffer_container<output_type>(this->stream(), size_workspace());
153  forward(input, output, workspace.data(), scaling);
154  }
155 
157  template<typename input_type, typename output_type>
158  void forward(input_type const input[], output_type output[], output_type workspace[], scale scaling = scale::none) const{
159  static_assert((std::is_same<input_type, float>::value and is_ccomplex<output_type>::value)
160  or (std::is_same<input_type, double>::value and is_zcomplex<output_type>::value),
161  "Using either an unknown complex type or an incompatible pair of types!");
162 
163  compute_transform<location_tag, index>(this->stream(), 1, convert_to_standard(input), convert_to_standard(output),
164  convert_to_standard(workspace),
165  executor_buffer_offset, size_comm_buffers(), forward_shaper,
166  forward_executors(), direction::forward);
167  apply_scale(1, direction::forward, scaling, output);
168  }
170  template<typename input_type, typename output_type>
171  void forward(int batch_size, input_type const input[], output_type output[],
172  output_type workspace[], scale scaling = scale::none) const{
173  static_assert((std::is_same<input_type, float>::value and is_ccomplex<output_type>::value)
174  or (std::is_same<input_type, double>::value and is_zcomplex<output_type>::value),
175  "Using either an unknown complex type or an incompatible pair of types!");
176 
177  compute_transform<location_tag, index>(this->stream(), batch_size, convert_to_standard(input), convert_to_standard(output),
178  convert_to_standard(workspace),
179  executor_buffer_offset, size_comm_buffers(), forward_shaper,
180  forward_executors(), direction::forward);
181  apply_scale(batch_size, direction::forward, scaling, output);
182  }
184  template<typename input_type, typename output_type>
185  void forward(int batch_size, input_type const input[], output_type output[], scale scaling = scale::none) const{
186  static_assert((std::is_same<input_type, float>::value and is_ccomplex<output_type>::value)
187  or (std::is_same<input_type, double>::value and is_zcomplex<output_type>::value),
188  "Using either an unknown complex type or an incompatible pair of types!");
189 
190  auto workspace = make_buffer_container<output_type>(this->stream(), batch_size * size_workspace());
191  forward(batch_size, input, output, workspace.data(), scaling);
192  }
193 
210  template<typename input_type>
212  if (input.size() < static_cast<size_t>(size_inbox()))
213  throw std::invalid_argument("The input vector is smaller than size_inbox(), i.e., not enough entries provided to fill the inbox.");
214  static_assert(std::is_same<input_type, float>::value or std::is_same<input_type, double>::value,
215  "The input to forward() must be real, i.e., either float or double.");
216  auto output = make_buffer_container<typename fft_output<input_type>::type>(this->stream(), size_outbox());
217  forward(input.data(), output.data(), scaling);
218  return output;
219  }
220 
234  template<typename input_type, typename output_type>
235  void backward(input_type const input[], output_type output[], scale scaling = scale::none) const{
236  static_assert((std::is_same<output_type, float>::value and is_ccomplex<input_type>::value)
237  or (std::is_same<output_type, double>::value and is_zcomplex<input_type>::value),
238  "Using either an unknown complex type or an incompatible pair of types!");
239 
240  auto workspace = make_buffer_container<input_type>(this->stream(), size_workspace());
241  backward(input, output, workspace.data(), scaling);
242  }
243 
245  template<typename input_type, typename output_type>
246  void backward(input_type const input[], output_type output[], input_type workspace[], scale scaling = scale::none) const{
247  static_assert((std::is_same<output_type, float>::value and is_ccomplex<input_type>::value)
248  or (std::is_same<output_type, double>::value and is_zcomplex<input_type>::value),
249  "Using either an unknown complex type or an incompatible pair of types!");
250 
251  compute_transform<location_tag, index>(this->stream(), 1, convert_to_standard(input), convert_to_standard(output),
252  convert_to_standard(workspace),
253  executor_buffer_offset, size_comm_buffers(), backward_shaper,
254  backward_executors(), direction::backward);
255  apply_scale(1, direction::backward, scaling, output);
256  }
258  template<typename input_type, typename output_type>
259  void backward(int batch_size, input_type const input[], output_type output[],
260  input_type workspace[], scale scaling = scale::none) const{
261  static_assert((std::is_same<output_type, float>::value and is_ccomplex<input_type>::value)
262  or (std::is_same<output_type, double>::value and is_zcomplex<input_type>::value),
263  "Using either an unknown complex type or an incompatible pair of types!");
264 
265  compute_transform<location_tag, index>(this->stream(), batch_size, convert_to_standard(input), convert_to_standard(output),
266  convert_to_standard(workspace),
267  executor_buffer_offset, size_comm_buffers(), backward_shaper,
268  backward_executors(), direction::backward);
269  apply_scale(batch_size, direction::backward, scaling, output);
270  }
272  template<typename input_type, typename output_type>
273  void backward(int batch_size, input_type const input[], output_type output[], scale scaling = scale::none) const{
274  static_assert((std::is_same<output_type, float>::value and is_ccomplex<input_type>::value)
275  or (std::is_same<output_type, double>::value and is_zcomplex<input_type>::value),
276  "Using either an unknown complex type or an incompatible pair of types!");
277 
278  auto workspace = make_buffer_container<input_type>(this->stream(), batch_size * size_workspace());
279  backward(batch_size, input, output, workspace.data(), scaling);
280  }
281 
285  template<typename scalar_type>
288  "Either calling backward() with non-complex input or using an unknown complex type.");
289  auto result = make_buffer_container<typename define_standard_type<scalar_type>::type::value_type>(this->stream(), size_inbox());
290  backward(input.data(), result.data(), scaling);
291  return result;
292  }
293 
297  double get_scale_factor(scale scaling) const{ return (scaling == scale::symmetric) ? std::sqrt(scale_factor) : scale_factor; }
298 
299 private:
301  fft3d_r2c(logic_plan3d<index> const &plan, MPI_Comm const comm) :
302  pinbox(new box3d<index>(plan.in_shape[0][plan.mpi_rank])), poutbox(new box3d<index>(plan.out_shape[3][plan.mpi_rank])),
303  scale_factor(1.0 / static_cast<double>(plan.index_count))
304  #ifdef Heffte_ENABLE_MAGMA
305  , hmagma(this->stream())
306  #endif
307  {
308  setup(plan, comm);
309  }
310 
313  logic_plan3d<index> const &plan, MPI_Comm const comm) :
314  backend::device_instance<location_tag>(gpu_stream),
315  pinbox(new box3d<index>(plan.in_shape[0][plan.mpi_rank])), poutbox(new box3d<index>(plan.out_shape[3][plan.mpi_rank])),
316  scale_factor(1.0 / static_cast<double>(plan.index_count))
317  #ifdef Heffte_ENABLE_MAGMA
318  , hmagma(this->stream())
319  #endif
320  {
321  setup(plan, comm);
322  }
323 
325  void setup(logic_plan3d<index> const &plan, MPI_Comm const comm){
326  for(int i=0; i<4; i++){
327  forward_shaper[i] = make_reshape3d<backend_tag>(this->stream(), plan.in_shape[i], plan.out_shape[i], comm, plan.options);
328  backward_shaper[3-i] = make_reshape3d<backend_tag>(this->stream(), plan.out_shape[i], plan.in_shape[i], comm, plan.options);
329  }
330 
331  executors[0] = make_executor_r2c<backend_tag>(this->stream(), plan.out_shape[0][mpi::comm_rank(comm)], plan.fft_direction[0]);
332  executors[1] = make_executor<backend_tag>(this->stream(), plan.out_shape[1][mpi::comm_rank(comm)], plan.fft_direction[1]);
333  executors[2] = make_executor<backend_tag>(this->stream(), plan.out_shape[2][mpi::comm_rank(comm)], plan.fft_direction[2]);
334 
335  size_t executor_workspace_size = get_max_work_size(executors);
336  comm_buffer_offset = std::max(get_workspace_size(forward_shaper), get_workspace_size(backward_shaper));
337  size_buffer_work = comm_buffer_offset
338  + get_max_box_size_r2c(executors) + executor_workspace_size;
339  executor_buffer_offset = (executor_workspace_size == 0) ? 0 : size_buffer_work - executor_workspace_size;
340  }
342  std::array<executor_base*, 3> forward_executors() const{
343  return std::array<executor_base*, 3>{executors[0].get(), executors[1].get(), executors[2].get()};
344  }
346  std::array<executor_base*, 3> backward_executors() const{
347  return std::array<executor_base*, 3>{executors[2].get(), executors[1].get(), executors[0].get()};
348  }
349 
351  template<typename scalar_type>
352  void apply_scale(int const batch_size, direction dir, scale scaling, scalar_type data[]) const{
353  if (scaling != scale::none){
354  add_trace name("scale");
355  #ifdef Heffte_ENABLE_MAGMA
356  if (std::is_same<location_tag, tag::gpu>::value){
357  hmagma.scal(batch_size * (dir == direction::forward) ? size_outbox() : size_inbox(), get_scale_factor(scaling), data);
358  return;
359  }
360  #endif
362  this->stream(),
363  batch_size * ((dir == direction::forward) ? size_outbox() : size_inbox()),
364  data, get_scale_factor(scaling));
365  }
366  }
367 
368  std::unique_ptr<box3d<index>> pinbox, poutbox;
369  double scale_factor;
370  std::array<std::unique_ptr<reshape3d_base<index>>, 4> forward_shaper;
371  std::array<std::unique_ptr<reshape3d_base<index>>, 4> backward_shaper;
372 
373  std::array<std::unique_ptr<executor_base>, 3> executors;
374  #ifdef Heffte_ENABLE_MAGMA
375  gpu::magma_handle<location_tag> hmagma;
376  #endif
377 
378  // cache some values for faster read
379  size_t size_buffer_work, comm_buffer_offset, executor_buffer_offset;
380 };
381 
386 template<typename backend_tag, typename index = int>
388 
393 template<typename backend_tag, typename index>
395  int r2c_direction, MPI_Comm const comm,
396  plan_options const options = default_options<backend_tag>()){
397  static_assert(std::is_same<index, int>::value or std::is_same<index, long long>::value,
398  "heFFTe works with 'int' and 'long long' indexing only");
400  "The backend_tag is not valid, perhaps it needs to be enabled in the build system");
401  return fft3d_r2c<backend_tag, index>(inbox, outbox, r2c_direction, comm, options);
402 }
403 
404 }
405 
406 #endif
Similar to heffte::fft3d, but computed fewer redundant coefficients when the input is real.
Definition: heffte_fft3d_r2c.h:46
buffer_container< typename define_standard_type< T >::type::value_type > real_buffer_container
Container of real values corresponding to the complex type T.
Definition: heffte_fft3d_r2c.h:62
fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2, int io0, int io1, int io2, int ol0, int ol1, int ol2, int oh0, int oh1, int oh2, int oo0, int oo1, int oo2, int r2c_direction, MPI_Comm const comm, bool use_reorder, int algorithm, bool use_pencils)
Internal use only, used by the Fortran interface.
Definition: heffte_fft3d_r2c.h:96
void forward(input_type const input[], output_type output[], scale scaling=scale::none) const
Performs a forward Fourier transform using two arrays.
Definition: heffte_fft3d_r2c.h:147
typename backend::buffer_traits< backend_tag >::template container< T > buffer_container
Alias to the container template associated with the backend (allows for RAII memory management).
Definition: heffte_fft3d_r2c.h:60
size_t size_comm_buffers() const
Returns the size used by the communication workspace buffers (internal use).
Definition: heffte_fft3d_r2c.h:131
void backward(int batch_size, input_type const input[], output_type output[], scale scaling=scale::none) const
Overload that performs a batch transform using internally allocated workspace.
Definition: heffte_fft3d_r2c.h:273
typename one_dim_backend< backend_tag >::executor_r2c backend_executor_r2c
FFT executor for the real-to-complex dimension.
Definition: heffte_fft3d_r2c.h:51
fft3d_r2c(typename backend::device_instance< location_tag >::stream_type gpu_stream, box3d< index > const inbox, box3d< index > const outbox, int r2c_direction, MPI_Comm const comm, plan_options const options=default_options< backend_tag >())
See the documentation for fft3d::fft3d()
Definition: heffte_fft3d_r2c.h:85
long long size_inbox() const
Returns the size of the inbox defined in the constructor.
Definition: heffte_fft3d_r2c.h:121
fft3d_r2c(box3d< index > const inbox, box3d< index > const outbox, int r2c_direction, MPI_Comm const comm, plan_options const options=default_options< backend_tag >())
Constructor creating a plan for FFT transform across the given communicator and using the box geometr...
Definition: heffte_fft3d_r2c.h:76
typename backend::buffer_traits< backend_tag >::location location_tag
Type-tag that is either tag::cpu or tag::gpu to indicate the location of the data.
Definition: heffte_fft3d_r2c.h:55
long long size_outbox() const
Returns the size of the outbox defined in the constructor.
Definition: heffte_fft3d_r2c.h:123
buffer_container< typename fft_output< T >::type > output_buffer_container
Container of the output type corresponding to T, see the table of compatible input and output types.
Definition: heffte_fft3d_r2c.h:64
void forward(int batch_size, input_type const input[], output_type output[], output_type workspace[], scale scaling=scale::none) const
Overload utilizing a batch transform.
Definition: heffte_fft3d_r2c.h:171
real_buffer_container< scalar_type > backward(buffer_container< scalar_type > const &input, scale scaling=scale::none)
Variant of backward() that uses buffer_container for RAII style of resource management.
Definition: heffte_fft3d_r2c.h:286
fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2, int ol0, int ol1, int ol2, int oh0, int oh1, int oh2, int r2c_direction, MPI_Comm const comm)
Internal use only, used by the Fortran interface.
Definition: heffte_fft3d_r2c.h:114
void backward(int batch_size, input_type const input[], output_type output[], input_type workspace[], scale scaling=scale::none) const
Overload that performs a batch transform.
Definition: heffte_fft3d_r2c.h:259
box3d< index > inbox() const
Returns the inbox.
Definition: heffte_fft3d_r2c.h:125
box3d< index > outbox() const
Returns the outbox.
Definition: heffte_fft3d_r2c.h:127
typename one_dim_backend< backend_tag >::executor backend_executor_c2c
FFT executor for the complex-to-complex dimensions.
Definition: heffte_fft3d_r2c.h:49
void forward(input_type const input[], output_type output[], output_type workspace[], scale scaling=scale::none) const
Overload utilizing a user provided buffer.
Definition: heffte_fft3d_r2c.h:158
size_t size_workspace() const
Returns the workspace size that will be used, size is measured in complex numbers.
Definition: heffte_fft3d_r2c.h:129
fft3d_r2c(int il0, int il1, int il2, int ih0, int ih1, int ih2, int io0, int io1, int io2, int ol0, int ol1, int ol2, int oh0, int oh1, int oh2, int oo0, int oo1, int oo2, int r2c_direction, MPI_Comm const comm)
Internal use only, used by the Fortran interface.
Definition: heffte_fft3d_r2c.h:106
void backward(input_type const input[], output_type output[], input_type workspace[], scale scaling=scale::none) const
Overload utilizing a user provided buffer.
Definition: heffte_fft3d_r2c.h:246
void forward(int batch_size, input_type const input[], output_type output[], scale scaling=scale::none) const
Overload utilizing a batch transform using internally allocated workspace.
Definition: heffte_fft3d_r2c.h:185
double get_scale_factor(scale scaling) const
Returns the scale factor for the given scaling.
Definition: heffte_fft3d_r2c.h:297
output_buffer_container< input_type > forward(buffer_container< input_type > const &input, scale scaling=scale::none)
Vector variant of forward() using input and output buffer_container classes.
Definition: heffte_fft3d_r2c.h:211
void backward(input_type const input[], output_type output[], scale scaling=scale::none) const
Performs a backward Fourier transform using two arrays.
Definition: heffte_fft3d_r2c.h:235
reshape_algorithm
Defines list of potential communication algorithms.
Definition: heffte_plan_logic.h:48
plan_options set_options(plan_options opts)
Adjusts the user provided options to what can be handled by the backend.
Definition: heffte_plan_logic.h:207
fft3d_r2c< backend_tag, index > make_fft3d_r2c(box3d< index > const inbox, box3d< index > const outbox, int r2c_direction, MPI_Comm const comm, plan_options const options=default_options< backend_tag >())
Factory method that auto-detects the index type based on the box.
Definition: heffte_fft3d_r2c.h:394
scale
Indicates the scaling factor to apply on the result of an FFT operation.
Definition: heffte_fft3d.h:113
@ none
No scale, leave the result unperturbed similar to the FFTW API.
@ symmetric
Symmetric scaling, apply the square-root of the full scaling.
std::string name()
Returns the human readable name of the backend.
Definition: heffte_common.h:240
direction
Indicates the direction of the FFT (internal use only).
Definition: heffte_common.h:535
size_t get_max_box_size_r2c(std::array< some_class, 3 > const &executors)
Returns the max of the box_size() for each of the executors.
Definition: heffte_utils.h:413
size_t get_max_work_size(std::array< some_class, 3 > const &executors)
Returns the max of the workspace_size() for each of the executors.
Definition: heffte_utils.h:423
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
logic_plan3d< index > plan_operations(ioboxes< index > const &boxes, int r2c_direction, plan_options const options, int const mpi_rank)
Creates the logic plan with the provided user input.
Definition: heffte_plan_logic.cpp:421
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition: heffte_utils.h:78
ioboxes< index > gather_boxes(box3d< index > const my_inbox, box3d< index > const my_outbox, MPI_Comm const comm)
Gather all boxes across all ranks in the comm.
Definition: heffte_geometry.h:697
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index >>, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition: heffte_reshape3d.h:115
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
define_standard_type< scalar_type >::type * convert_to_standard(scalar_type input[])
Converts an array of some type to an array of the C++ equivalent type.
Definition: heffte_utils.h:354
Defines the container for the temporary buffers.
Definition: heffte_common.h:212
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition: heffte_common.h:370
void * stream()
Returns the nullptr.
Definition: heffte_common.h:364
device_instance(void *=nullptr)
Empty constructor.
Definition: heffte_common.h:360
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
Struct to specialize to allow HeFFTe to recognize custom single precision complex types.
Definition: heffte_utils.h:251
Struct to specialize to allow HeFFTe to recognize custom double precision complex types.
Definition: heffte_utils.h:269
The logic plan incorporates the order and types of operations in a transform.
Definition: heffte_plan_logic.h:275
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546
Defines a set of tweaks and options to use in the plan generation.
Definition: heffte_plan_logic.h:131
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition: heffte_common.h:38