Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_backend_cuda.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_BACKEND_CUDA_H
8 #define HEFFTE_BACKEND_CUDA_H
9 
10 #include "heffte_r2r_executor.h"
11 
12 #ifdef Heffte_ENABLE_CUDA
13 
14 #include <cuda_runtime_api.h>
15 #include <cuda.h>
16 #include <cufft.h>
17 #include "heffte_backend_vector.h"
18 
19 #ifdef Heffte_ENABLE_MAGMA
20 #include "heffte_magma_helpers.h"
21 #endif
22 
38 namespace heffte{
39 
44 namespace cuda {
49  inline void check_error(cudaError_t status, const char *function_name){
50  if (status != cudaSuccess)
51  throw std::runtime_error(std::string(function_name) + " failed with message: " + cudaGetErrorString(status));
52  }
57  inline void check_error(cufftResult status, const char *function_name){
58  if (status != CUFFT_SUCCESS)
59  throw std::runtime_error(std::string(function_name) + " failed with error code: " + std::to_string(status));
60  }
67  template<typename precision_type, typename index>
68  void convert(cudaStream_t stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);
75  template<typename precision_type, typename index>
76  void convert(cudaStream_t stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);
77 
82  template<typename scalar_type, typename index>
83  void scale_data(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor);
84 
91  template<typename scalar_type, typename index>
92  void direct_pack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
99  template<typename scalar_type, typename index>
100  void direct_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
107  template<typename scalar_type, typename index>
108  void transpose_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
109  index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,
110  scalar_type const source[], scalar_type destination[]);
111 
118  template<typename precision>
119  static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[]);
121  template<typename precision>
122  static void post_forward(cudaStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
124  template<typename precision>
125  static void pre_backward(cudaStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
127  template<typename precision>
128  static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[]);
129  };
136  template<typename precision>
137  static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[]);
139  template<typename precision>
140  static void post_forward(cudaStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
142  template<typename precision>
143  static void pre_backward(cudaStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
145  template<typename precision>
146  static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[]);
147  };
148 }
149 
150 namespace backend{
155  template<> struct is_enabled<cufft> : std::true_type{};
160  template<> struct is_enabled<cufft_cos> : std::true_type{};
165  template<> struct is_enabled<cufft_sin> : std::true_type{};
166 
171  template<>
172  struct device_instance<tag::gpu>{
174  device_instance(cudaStream_t new_stream = nullptr) : _stream(new_stream){}
176  cudaStream_t stream(){ return _stream; }
178  cudaStream_t stream() const{ return _stream; }
180  void synchronize_device() const{ cuda::check_error(cudaStreamSynchronize(_stream), "device sync"); }
182  mutable cudaStream_t _stream;
184  using stream_type = cudaStream_t;
185  };
186 
191  template<> struct default_backend<tag::gpu>{
193  using type = cufft;
194  };
195 
200  template<> struct data_manipulator<tag::gpu> {
202  using stream_type = cudaStream_t;
206  template<typename scalar_type>
207  static scalar_type* allocate(cudaStream_t stream, size_t num_entries){
208  void *new_data;
209  if (stream != nullptr) cuda::check_error( cudaStreamSynchronize(stream), "cudaStreamSynchronize()");
210  cuda::check_error(cudaMalloc(&new_data, num_entries * sizeof(scalar_type)), "cudaMalloc()");
211  return reinterpret_cast<scalar_type*>(new_data);
212  }
214  template<typename scalar_type>
215  static void free(cudaStream_t stream, scalar_type *pntr){
216  if (pntr == nullptr) return;
217  if (stream != nullptr) cuda::check_error( cudaStreamSynchronize(stream), "cudaStreamSynchronize()");
218  cuda::check_error(cudaFree(pntr), "cudaFree()");
219  }
221  template<typename scalar_type>
222  static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
223  if (stream == nullptr)
224  cuda::check_error(cudaMemcpy(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice), "data_manipulator::copy_n()");
225  else
226  cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice, stream), "data_manipulator::copy_n()");
227  }
229  template<typename scalar_type>
230  static void copy_n(cudaStream_t stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){
231  cuda::convert(stream, static_cast<long long>(num_entries), source, destination);
232  }
234  template<typename scalar_type>
235  static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){
236  cuda::convert(stream, static_cast<long long>(num_entries), source, destination);
237  }
239  template<typename scalar_type>
240  static void copy_device_to_host(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
241  cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToHost, stream),
242  "device_to_host (cuda)");
243  }
245  template<typename scalar_type>
246  static void copy_device_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
247  cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice, stream),
248  "device_to_device (cuda)");
249  }
251  template<typename scalar_type>
252  static void copy_host_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
253  cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyHostToDevice, stream),
254  "host_to_device (cuda)");
255  }
256  };
257 
262  template<>
267  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
268  };
273  template<>
278  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
279  };
284  template<>
289  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
290  };
291 }
292 
297 template<> struct is_ccomplex<cufftComplex> : std::true_type{};
302 template<> struct is_zcomplex<cufftDoubleComplex> : std::true_type{};
303 
310 template<typename scalar_type> struct plan_cufft{
320  plan_cufft(cudaStream_t stream, int size, int batch, int stride, int dist){
321  size_t work_size = 0;
322  cuda::check_error(cufftCreate(&plan), "plan_cufft::cufftCreate()");
324  cufftMakePlanMany(plan, 1, &size, &size, stride, dist, &size, stride, dist,
325  (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
326  batch, &work_size),
327  "plan_cufft::cufftMakePlanMany() 1D"
328  );
329 
330  if (stream != nullptr)
331  cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
332  }
344  plan_cufft(cudaStream_t stream, int size1, int size2, std::array<int, 2> const &embed, int batch, int stride, int dist){
345  size_t work_size = 0;
346  int size[2] = {size2, size1};
347 
348  cuda::check_error(cufftCreate(&plan), "plan_cufft::cufftCreate()");
349  if (embed[0] == 0 and embed[1] == 0){
351  cufftMakePlanMany(plan, 2, size, size, stride, dist, size, stride, dist,
352  (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
353  batch, &work_size),
354  "plan_cufft::cufftMakePlanMany() 2D"
355  );
356  }else{
358  cufftMakePlanMany(plan, 2, size, const_cast<int*>(embed.data()), stride, dist, const_cast<int*>(embed.data()), stride, dist,
359  (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
360  batch, &work_size),
361  "plan_cufft::cufftMakePlanMany() 2D with embedding"
362  );
363  }
364 
365  if (stream != nullptr)
366  cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
367  }
369  plan_cufft(cudaStream_t stream, int size1, int size2, int size3){
371  cufftPlan3d(&plan, size3, size2, size1,
372  (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z),
373  "plan_cufft::cufftPlan3d()"
374  );
375  if (stream != nullptr)
376  cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
377  }
379  ~plan_cufft(){ cufftDestroy(plan); }
381  operator cufftHandle() const{ return plan; }
382 
383 private:
385  cufftHandle plan;
386 };
387 
401 public:
409  template<typename index>
410  cufft_executor(cudaStream_t active_stream, box3d<index> const box, int dimension) :
411  stream(active_stream),
412  size(box.size[dimension]), size2(0),
413  howmanyffts(fft1d_get_howmany(box, dimension)),
414  stride(fft1d_get_stride(box, dimension)),
415  dist((dimension == box.order[0]) ? size : 1),
416  blocks((dimension == box.order[1]) ? box.osize(2) : 1),
417  block_stride(box.osize(0) * box.osize(1)),
418  total_size(box.count()),
419  embed({0, 0})
420  {}
422  template<typename index>
423  cufft_executor(cudaStream_t active_stream, box3d<index> const box, int dir1, int dir2) :
424  stream(active_stream),
425  size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
426  blocks(1), block_stride(0), total_size(box.count()), embed({0, 0})
427  {
428  int odir1 = box.find_order(dir1);
429  int odir2 = box.find_order(dir2);
430 
431  if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
432  stride = 1;
433  dist = size * size2;
434  howmanyffts = box.size[2];
435  }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
436  stride = box.size[0];
437  dist = 1;
438  howmanyffts = box.size[0];
439  }else{ // case of directions (0, 2)
440  stride = 1;
441  dist = size;
442  embed = {static_cast<int>(box.size[2]), static_cast<int>(box.size[1] * box.size[0])};
443  howmanyffts = box.size[1];
444  }
445  }
447  template<typename index>
448  cufft_executor(cudaStream_t active_stream, box3d<index> const box) :
449  stream(active_stream),
450  size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
451  stride(0), dist(0),
452  blocks(1), block_stride(0),
453  total_size(box.count()),
454  embed({0, 0})
455  {}
456 
458  void forward(std::complex<float> data[], std::complex<float>*) const override{
459  make_plan(ccomplex_plan);
460  for(int i=0; i<blocks; i++){
461  cufftComplex* block_data = reinterpret_cast<cufftComplex*>(data + i * block_stride);
462  cuda::check_error(cufftExecC2C(*ccomplex_plan, block_data, block_data, CUFFT_FORWARD), "cufft_executor::cufftExecC2C() forward");
463  }
464  }
466  void backward(std::complex<float> data[], std::complex<float>*) const override{
467  make_plan(ccomplex_plan);
468  for(int i=0; i<blocks; i++){
469  cufftComplex* block_data = reinterpret_cast<cufftComplex*>(data + i * block_stride);
470  cuda::check_error(cufftExecC2C(*ccomplex_plan, block_data, block_data, CUFFT_INVERSE), "cufft_executor::cufftExecC2C() backward");
471  }
472  }
474  void forward(std::complex<double> data[], std::complex<double>*) const override{
475  make_plan(zcomplex_plan);
476  for(int i=0; i<blocks; i++){
477  cufftDoubleComplex* block_data = reinterpret_cast<cufftDoubleComplex*>(data + i * block_stride);
478  cuda::check_error(cufftExecZ2Z(*zcomplex_plan, block_data, block_data, CUFFT_FORWARD), "cufft_executor::cufftExecZ2Z() forward");
479  }
480  }
482  void backward(std::complex<double> data[], std::complex<double>*) const override{
483  make_plan(zcomplex_plan);
484  for(int i=0; i<blocks; i++){
485  cufftDoubleComplex* block_data = reinterpret_cast<cufftDoubleComplex*>(data + i * block_stride);
486  cuda::check_error(cufftExecZ2Z(*zcomplex_plan, block_data, block_data, CUFFT_INVERSE), "cufft_executor::cufftExecZ2Z() backward");
487  }
488  }
489 
491  void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
492  cuda::convert(stream, total_size, indata, outdata);
493  forward(outdata, workspace);
494  }
496  void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const{
497  backward(indata, workspace);
498  cuda::convert(stream, total_size, indata, outdata);
499  }
501  void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
502  cuda::convert(stream, total_size, indata, outdata);
503  forward(outdata, workspace);
504  }
506  void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
507  backward(indata, workspace);
508  cuda::convert(stream, total_size, indata, outdata);
509  }
510 
512  int box_size() const override{ return total_size; }
514  size_t workspace_size() const override{ return 0; }
515 
516 private:
518  template<typename scalar_type>
519  void make_plan(std::unique_ptr<plan_cufft<scalar_type>> &plan) const{
520  if (not plan){
521  if (dist == 0)
522  plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, size2, howmanyffts));
523  else if (size2 == 0)
524  plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, howmanyffts, stride, dist));
525  else
526  plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, size2, embed, howmanyffts, stride, dist));
527  }
528  }
529 
530  mutable cudaStream_t stream;
531 
532  int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
533  std::array<int, 2> embed;
534  mutable std::unique_ptr<plan_cufft<std::complex<float>>> ccomplex_plan;
535  mutable std::unique_ptr<plan_cufft<std::complex<double>>> zcomplex_plan;
536 };
537 
544 template<typename scalar_type> struct plan_cufft_r2c{
556  plan_cufft_r2c(cudaStream_t stream, direction dir, int size, int batch, int stride, int rdist, int cdist){
557  static_assert(std::is_same<scalar_type, float>::value or std::is_same<scalar_type, double>::value,
558  "plan_cufft_r2c can be used only with scalar_type float or double.");
559 
560  size_t work_size = 0;
561  cuda::check_error(cufftCreate(&plan), "plan_cufft_r2c::cufftCreate()");
562 
564  cufftMakePlanMany(plan, 1, &size, &size, stride,
565  (dir == direction::forward) ? rdist : cdist,
566  &size, stride,
567  (dir == direction::forward) ? cdist : rdist,
568  (std::is_same<scalar_type, float>::value) ?
569  (dir == direction::forward) ? CUFFT_R2C : CUFFT_C2R
570  : (dir == direction::forward) ? CUFFT_D2Z : CUFFT_Z2D,
571  batch, &work_size),
572  "plan_cufft_r2c::cufftMakePlanMany() (forward)"
573  );
574  if (stream != nullptr)
575  cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
576  }
578  ~plan_cufft_r2c(){ cufftDestroy(plan); }
580  operator cufftHandle() const{ return plan; }
581 
582 private:
584  cufftHandle plan;
585 };
586 
596 public:
606  template<typename index>
607  cufft_executor_r2c(cudaStream_t active_stream, box3d<index> const box, int dimension) :
608  stream(active_stream),
609  size(box.size[dimension]),
610  howmanyffts(fft1d_get_howmany(box, dimension)),
611  stride(fft1d_get_stride(box, dimension)),
612  blocks((dimension == box.order[1]) ? box.osize(2) : 1),
613  rdist((dimension == box.order[0]) ? size : 1),
614  cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
615  rblock_stride(box.osize(0) * box.osize(1)),
616  cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
617  rsize(box.count()),
618  csize(box.r2c(dimension).count())
619  {}
620 
622  void forward(float const indata[], std::complex<float> outdata[], std::complex<float>*) const override{
623  make_plan(sforward, direction::forward);
624  if (blocks == 1 or rblock_stride % 2 == 0){
625  for(int i=0; i<blocks; i++){
626  cufftReal *rdata = const_cast<cufftReal*>(indata + i * rblock_stride);
627  cufftComplex* cdata = reinterpret_cast<cufftComplex*>(outdata + i * cblock_stride);
628  cuda::check_error(cufftExecR2C(*sforward, rdata, cdata), "cufft_executor::cufftExecR2C()");
629  }
630  }else{
631  // need to create a temporary copy of the data since cufftExecR2C() requires aligned input
632  backend::buffer_traits<backend::cufft>::container<float> rdata(stream, rblock_stride);
633  for(int i=0; i<blocks; i++){
634  backend::data_manipulator<tag::gpu>::copy_n(stream, indata + i * rblock_stride, rblock_stride, rdata.data());
635  cufftComplex* cdata = reinterpret_cast<cufftComplex*>(outdata + i * cblock_stride);
636  cuda::check_error(cufftExecR2C(*sforward, rdata.data(), cdata), "cufft_executor::cufftExecR2C()");
637  }
638  }
639  }
641  void backward(std::complex<float> indata[], float outdata[], std::complex<float>*) const override{
642  make_plan(sbackward, direction::backward);
643  if (blocks == 1 or rblock_stride % 2 == 0){
644  for(int i=0; i<blocks; i++){
645  cufftComplex* cdata = const_cast<cufftComplex*>(reinterpret_cast<cufftComplex const*>(indata + i * cblock_stride));
646  cuda::check_error(cufftExecC2R(*sbackward, cdata, outdata + i * rblock_stride), "cufft_executor::cufftExecC2R()");
647  }
648  }else{
649  backend::buffer_traits<backend::cufft>::container<float> odata(stream, rblock_stride);
650  for(int i=0; i<blocks; i++){
651  cufftComplex* cdata = const_cast<cufftComplex*>(reinterpret_cast<cufftComplex const*>(indata + i * cblock_stride));
652  cuda::check_error(cufftExecC2R(*sbackward, cdata, odata.data()), "cufft_executor::cufftExecC2R()");
653  backend::data_manipulator<tag::gpu>::copy_n(stream, odata.data(), rblock_stride, outdata + i * rblock_stride);
654  }
655  }
656  }
658  void forward(double const indata[], std::complex<double> outdata[], std::complex<double>*) const override{
659  make_plan(dforward, direction::forward);
660  if (blocks == 1 or rblock_stride % 2 == 0){
661  for(int i=0; i<blocks; i++){
662  cufftDoubleReal *rdata = const_cast<cufftDoubleReal*>(indata + i * rblock_stride);
663  cufftDoubleComplex* cdata = reinterpret_cast<cufftDoubleComplex*>(outdata + i * cblock_stride);
664  cuda::check_error(cufftExecD2Z(*dforward, rdata, cdata), "cufft_executor::cufftExecD2Z()");
665  }
666  }else{
667  backend::buffer_traits<backend::cufft>::container<double> rdata(stream, rblock_stride);
668  for(int i=0; i<blocks; i++){
669  backend::data_manipulator<tag::gpu>::copy_n(stream, indata + i * rblock_stride, rblock_stride, rdata.data());
670  cufftDoubleComplex* cdata = reinterpret_cast<cufftDoubleComplex*>(outdata + i * cblock_stride);
671  cuda::check_error(cufftExecD2Z(*dforward, rdata.data(), cdata), "cufft_executor::cufftExecD2Z()");
672  }
673  }
674  }
676  void backward(std::complex<double> indata[], double outdata[], std::complex<double>*) const override{
677  make_plan(dbackward, direction::backward);
678  if (blocks == 1 or rblock_stride % 2 == 0){
679  for(int i=0; i<blocks; i++){
680  cufftDoubleComplex* cdata = const_cast<cufftDoubleComplex*>(reinterpret_cast<cufftDoubleComplex const*>(indata + i * cblock_stride));
681  cuda::check_error(cufftExecZ2D(*dbackward, cdata, outdata + i * rblock_stride), "cufft_executor::cufftExecZ2D()");
682  }
683  }else{
684  backend::buffer_traits<backend::cufft>::container<double> odata(stream, rblock_stride);
685  for(int i=0; i<blocks; i++){
686  cufftDoubleComplex* cdata = const_cast<cufftDoubleComplex*>(reinterpret_cast<cufftDoubleComplex const*>(indata + i * cblock_stride));
687  cuda::check_error(cufftExecZ2D(*dbackward, cdata, odata.data()), "cufft_executor::cufftExecZ2D()");
688  backend::data_manipulator<tag::gpu>::copy_n(stream, odata.data(), rblock_stride, outdata + i * rblock_stride);
689  }
690  }
691  }
692 
694  int box_size() const override{ return rsize; }
696  int complex_size() const override{ return csize; }
698  size_t workspace_size() const override{ return 0; }
699 
700 private:
702  template<typename scalar_type>
703  void make_plan(std::unique_ptr<plan_cufft_r2c<scalar_type>> &plan, direction dir) const{
704  if (!plan) plan = std::unique_ptr<plan_cufft_r2c<scalar_type>>(new plan_cufft_r2c<scalar_type>(stream, dir, size, howmanyffts, stride, rdist, cdist));
705  }
706 
707  cudaStream_t stream;
708 
709  int size, howmanyffts, stride, blocks;
710  int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
711  mutable std::unique_ptr<plan_cufft_r2c<float>> sforward;
712  mutable std::unique_ptr<plan_cufft_r2c<double>> dforward;
713  mutable std::unique_ptr<plan_cufft_r2c<float>> sbackward;
714  mutable std::unique_ptr<plan_cufft_r2c<double>> dbackward;
715 };
716 
723 template<> struct one_dim_backend<backend::cufft>{
728 };
729 
736 template<> struct one_dim_backend<backend::cufft_cos>{
740  using executor_r2c = void;
741 };
748 template<> struct one_dim_backend<backend::cufft_sin>{
752  using executor_r2c = void;
753 };
754 
759 template<> struct direct_packer<tag::gpu>{
761  template<typename scalar_type, typename index>
762  void pack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
763  cuda::direct_pack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, data, buffer);
764  }
766  template<typename scalar_type, typename index>
767  void unpack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
768  cuda::direct_unpack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, buffer, data);
769  }
770 };
771 
776 template<> struct transpose_packer<tag::gpu>{
778  template<typename scalar_type, typename index>
779  void pack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
780  direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer
781  }
783  template<typename scalar_type, typename index>
784  void unpack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
785  cuda::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
786  plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);
787  }
788 };
789 
790 namespace data_scaling {
795  template<typename scalar_type, typename index>
796  void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor){
797  cuda::scale_data(stream, static_cast<long long>(num_entries), data, scale_factor);
798  }
803  template<typename precision_type, typename index>
804  void apply(cudaStream_t stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
805  apply<precision_type>(stream, 2*num_entries, reinterpret_cast<precision_type*>(data), scale_factor);
806  }
807 };
808 
813 template<> struct default_plan_options<backend::cufft>{
815  static const bool use_reorder = false;
816 };
821 template<> struct default_plan_options<backend::cufft_cos>{
823  static const bool use_reorder = true;
824 };
829 template<> struct default_plan_options<backend::cufft_sin>{
831  static const bool use_reorder = true;
832 };
833 
834 }
835 
836 #endif
837 
838 #endif /* HEFFTE_BACKEND_FFTW_H */
Wrapper to cuFFT API for real-to-complex transform with shortening of the data.
Definition: heffte_backend_cuda.h:595
cufft_executor_r2c(cudaStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition: heffte_backend_cuda.h:607
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *) const override
Backward transform, single precision.
Definition: heffte_backend_cuda.h:641
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *) const override
Forward transform, single precision.
Definition: heffte_backend_cuda.h:622
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_cuda.h:698
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *) const override
Forward transform, double precision.
Definition: heffte_backend_cuda.h:658
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *) const override
Backward transform, double precision.
Definition: heffte_backend_cuda.h:676
int box_size() const override
Returns the size of the box with real data.
Definition: heffte_backend_cuda.h:694
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition: heffte_backend_cuda.h:696
Wrapper around the cuFFT API.
Definition: heffte_backend_cuda.h:400
virtual void backward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:495
void forward(std::complex< float > data[], std::complex< float > *) const override
Forward fft, float-complex case.
Definition: heffte_backend_cuda.h:458
void backward(std::complex< float > data[], std::complex< float > *) const override
Backward fft, float-complex case.
Definition: heffte_backend_cuda.h:466
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition: heffte_backend_cuda.h:506
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the deal data to complex and performs float-complex forward transform.
Definition: heffte_backend_cuda.h:491
cufft_executor(cudaStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition: heffte_backend_cuda.h:448
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const
Performs backward float-complex transform and truncates the complex part of the result.
Definition: heffte_backend_cuda.h:496
virtual void forward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:491
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_cuda.h:514
cufft_executor(cudaStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition: heffte_backend_cuda.h:410
cufft_executor(cudaStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition: heffte_backend_cuda.h:423
void backward(std::complex< double > data[], std::complex< double > *) const override
Backward fft, double-complex case.
Definition: heffte_backend_cuda.h:482
int box_size() const override
Returns the size of the box.
Definition: heffte_backend_cuda.h:512
void forward(std::complex< double > data[], std::complex< double > *) const override
Forward fft, double-complex case.
Definition: heffte_backend_cuda.h:474
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the deal data to complex and performs double-complex forward transform.
Definition: heffte_backend_cuda.h:501
Base class for all backend executors.
Definition: heffte_common.h:486
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition: heffte_common.h:519
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition: heffte_common.h:495
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition: heffte_common.h:491
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:169
direction
Indicates the direction of the FFT (internal use only).
Definition: heffte_common.h:535
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
void transpose_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void check_error(cudaError_t status, const char *function_name)
Checks the status of a CUDA command and in case of a failure, converts it to a C++ exception.
Definition: heffte_backend_cuda.h:49
void scale_data(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
void convert(cudaStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition: heffte_backend_cuda.h:267
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition: heffte_backend_cuda.h:278
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition: heffte_backend_cuda.h:289
Defines the container for the temporary buffers.
Definition: heffte_common.h:212
std::vector< T > container
Defines the container template to use for the temporary buffers in heffte::fft3d.
Definition: heffte_common.h:216
Type-tag for the Cosine Transform using the cuFFT backend.
Definition: heffte_common.h:153
Type-tag for the Sine Transform using the cuFFT backend.
Definition: heffte_common.h:158
Type-tag for the cuFFT backend.
Definition: heffte_common.h:147
cudaStream_t stream_type
The stream type for the device.
Definition: heffte_backend_cuda.h:202
static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition: heffte_backend_cuda.h:222
static void free(cudaStream_t stream, scalar_type *pntr)
Free memory.
Definition: heffte_backend_cuda.h:215
static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition: heffte_backend_cuda.h:235
static void copy_device_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition: heffte_backend_cuda.h:246
static void copy_host_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition: heffte_backend_cuda.h:252
static void copy_device_to_host(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition: heffte_backend_cuda.h:240
static void copy_n(cudaStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition: heffte_backend_cuda.h:230
static scalar_type * allocate(cudaStream_t stream, size_t num_entries)
Allocate memory.
Definition: heffte_backend_cuda.h:207
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition: heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition: heffte_common.h:380
The CUDA backend uses a CUDA stream.
Definition: heffte_backend_cuda.h:172
cudaStream_t stream() const
Returns the nullptr (const case).
Definition: heffte_backend_cuda.h:178
void synchronize_device() const
Syncs the execution with the queue, no-op in the CPU case.
Definition: heffte_backend_cuda.h:180
cudaStream_t _stream
The CUDA stream to be used in all operations.
Definition: heffte_backend_cuda.h:182
cudaStream_t stream_type
The type for the internal stream.
Definition: heffte_backend_cuda.h:184
cudaStream_t stream()
Returns the nullptr.
Definition: heffte_backend_cuda.h:176
device_instance(cudaStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition: heffte_backend_cuda.h:174
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_cuda.h:116
static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(cudaStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_forward(cudaStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_cuda.h:134
static void post_forward(cudaStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(cudaStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
Defines a set of default plan options for a given backend.
Definition: heffte_common.h:642
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_backend_cuda.h:759
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:762
void unpack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_backend_cuda.h:767
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
Struct to specialize to allow HeFFTe to recognize custom single precision complex types.
Definition: heffte_utils.h:251
Struct to specialize to allow HeFFTe to recognize custom double precision complex types.
Definition: heffte_utils.h:269
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_cuda.h:740
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_cuda.h:752
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40
Plan for the r2c single and double precision transform.
Definition: heffte_backend_cuda.h:544
plan_cufft_r2c(cudaStream_t stream, direction dir, int size, int batch, int stride, int rdist, int cdist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_cuda.h:556
~plan_cufft_r2c()
Destructor, deletes the plan.
Definition: heffte_backend_cuda.h:578
Wrapper around cufftHandle plans, set for float or double complex.
Definition: heffte_backend_cuda.h:310
plan_cufft(cudaStream_t stream, int size1, int size2, int size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition: heffte_backend_cuda.h:369
plan_cufft(cudaStream_t stream, int size, int batch, int stride, int dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_cuda.h:320
plan_cufft(cudaStream_t stream, int size1, int size2, std::array< int, 2 > const &embed, int batch, int stride, int dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_cuda.h:344
~plan_cufft()
Destructor, deletes the plan.
Definition: heffte_backend_cuda.h:379
Template algorithm for the Sine and Cosine transforms.
Definition: heffte_r2r_executor.h:135
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition: heffte_common.h:45
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:779
void unpack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition: heffte_backend_cuda.h:784
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition: heffte_pack3d.h:116