Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_backend_rocm.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_BACKEND_ROCM_H
8 #define HEFFTE_BACKEND_ROCM_H
9 
10 #include "heffte_r2r_executor.h"
11 
12 #ifdef Heffte_ENABLE_ROCM
13 
14 #ifndef __HIP_PLATFORM_HCC__
15 #define __HIP_PLATFORM_HCC__
16 #endif
17 #include <hip/hip_runtime.h>
18 #include <rocfft.h>
19 #include "heffte_backend_vector.h"
20 
21 #ifdef Heffte_ENABLE_MAGMA
22 #include "heffte_magma_helpers.h"
23 #endif
24 
40 namespace heffte{
41 
46 namespace rocm {
51  inline void check_error(hipError_t status, const char *function_name){
52  if (status != hipSuccess)
53  throw std::runtime_error(std::string(function_name) + " failed with message: " + std::string(hipGetErrorString(status)));
54  }
59  inline void check_error(rocfft_status status, const char *function_name){
60  if (status != rocfft_status_success)
61  throw std::runtime_error(std::string(function_name) + " failed with error code: " + std::to_string(status));
62  }
63 
70  template<typename precision_type, typename index>
71  void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);
78  template<typename precision_type, typename index>
79  void convert(hipStream_t stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);
80 
85  template<typename scalar_type, typename index>
86  void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor);
87 
94  template<typename scalar_type, typename index>
95  void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
96  scalar_type const source[], scalar_type destination[]);
103  template<typename scalar_type, typename index>
104  void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
105  scalar_type const source[], scalar_type destination[]);
112  template<typename scalar_type, typename index>
113  void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
114  index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,
115  scalar_type const source[], scalar_type destination[]);
116 
123  template<typename precision>
124  static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);
126  template<typename precision>
127  static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
129  template<typename precision>
130  static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
132  template<typename precision>
133  static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);
134  };
141  template<typename precision>
142  static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);
144  template<typename precision>
145  static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
147  template<typename precision>
148  static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
150  template<typename precision>
151  static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);
152  };
153 }
154 
155 namespace backend{
160  template<> struct is_enabled<rocfft> : std::true_type{};
165  template<> struct is_enabled<rocfft_cos> : std::true_type{};
170  template<> struct is_enabled<rocfft_sin> : std::true_type{};
171 
176  template<>
177  struct device_instance<tag::gpu>{
179  device_instance(hipStream_t new_stream = nullptr) : _stream(new_stream){}
181  hipStream_t stream(){ return _stream; }
183  hipStream_t stream() const{ return _stream; }
185  void synchronize_device() const{ rocm::check_error(hipStreamSynchronize(_stream), "device sync"); }
187  mutable hipStream_t _stream;
189  using stream_type = hipStream_t;
190  };
191 
196  template<> struct default_backend<tag::gpu>{
198  using type = rocfft;
199  };
200 
205  template<> struct data_manipulator<tag::gpu> {
207  using stream_type = hipStream_t;
211  template<typename scalar_type>
212  static scalar_type* allocate(hipStream_t stream, size_t num_entries){
213  void *new_data;
214  if (stream != nullptr) rocm::check_error( hipStreamSynchronize(stream), "hipStreamSynchronize()");
215  rocm::check_error(hipMalloc(&new_data, num_entries * sizeof(scalar_type)), "hipMalloc()");
216  return reinterpret_cast<scalar_type*>(new_data);
217  }
219  template<typename scalar_type>
220  static void free(hipStream_t stream, scalar_type *pntr){
221  if (pntr == nullptr) return;
222  if (stream != nullptr) rocm::check_error( hipStreamSynchronize(stream), "hipStreamSynchronize()");
223  rocm::check_error(hipFree(pntr), "hipFree()");
224  }
226  template<typename scalar_type>
227  static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
228  if (stream == nullptr)
229  rocm::check_error(hipMemcpy(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice), "data_manipulator::copy_n()");
230  else
231  rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream), "data_manipulator::copy_n()");
232  }
234  template<typename scalar_type>
235  static void copy_n(hipStream_t stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){
236  rocm::convert(stream, static_cast<long long>(num_entries), source, destination);
237  }
239  template<typename scalar_type>
240  static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){
241  rocm::convert(stream, static_cast<long long>(num_entries), source, destination);
242  }
244  template<typename scalar_type>
245  static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
246  rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToHost, stream),
247  "device_to_host (rocm)");
248  }
250  template<typename scalar_type>
251  static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
252  rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream),
253  "device_to_device (rocm)");
254  }
256  template<typename scalar_type>
257  static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
258  rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyHostToDevice, stream),
259  "host_to_device (rocm)");
260  }
261  };
262 
267  template<>
272  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
273  };
278  template<>
283  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
284  };
289  template<>
294  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
295  };
296 }
297 
305 template<typename precision_type, direction dir>
306 struct plan_rocfft{
316  plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist){
317 
318  rocfft_plan_description desc = nullptr;
319  rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
320 
322  rocfft_plan_description_set_data_layout(
323  desc,
324  (dir == direction::forward) ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
325  (dir == direction::forward) ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
326  nullptr, nullptr,
327  1, &stride, (dir == direction::forward) ? rdist : cdist,
328  1, &stride, (dir == direction::forward) ? cdist : rdist
329  ),
330  "plan layout"
331  );
332 
334  rocfft_plan_create(&plan, rocfft_placement_notinplace,
335  (dir == direction::forward) ? rocfft_transform_type_real_forward : rocfft_transform_type_real_inverse,
336  (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
337  1, &size, batch, desc),
338  "plan create");
339 
340  rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
341 
342  rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
343  }
345  ~plan_rocfft(){ rocfft_plan_destroy(plan); }
347  operator rocfft_plan() const{ return plan; }
349  size_t size_work() const{ return worksize; }
350 
351 private:
353  rocfft_plan plan;
355  size_t worksize;
356 };
357 
362 template<typename precision_type, direction dir>
363 struct plan_rocfft<std::complex<precision_type>, dir>{
372  plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist) : plan(nullptr), worksize(0){
373  rocfft_plan_description desc = nullptr;
374  rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
375 
377  rocfft_plan_description_set_data_layout(
378  desc,
379  rocfft_array_type_complex_interleaved,
380  rocfft_array_type_complex_interleaved,
381  nullptr, nullptr,
382  1, &stride, dist, 1, &stride, dist
383  ),
384  "plan layout"
385  );
386 
388  rocfft_plan_create(&plan, rocfft_placement_inplace,
389  (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
390  (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
391  1, &size, batch, desc),
392  "plan create");
393 
394  rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
395 
396  rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
397  }
407  plan_rocfft(size_t size1, size_t size2, std::array<size_t, 2> const &embed, size_t batch, size_t dist) : plan(nullptr), worksize(0){
408  size_t size[2] = {size1, size2};
409 
410  rocfft_plan_description desc = nullptr;
411  rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
412 
414  rocfft_plan_description_set_data_layout(
415  desc,
416  rocfft_array_type_complex_interleaved,
417  rocfft_array_type_complex_interleaved,
418  nullptr, nullptr,
419  2, embed.data(), dist, 2, embed.data(), dist
420  ),
421  "plan layout"
422  );
423 
425  rocfft_plan_create(&plan, rocfft_placement_inplace,
426  (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
427  (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
428  2, size, batch, desc),
429  "plan create");
430 
431  rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
432 
433  rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
434  }
436  plan_rocfft(size_t size1, size_t size2, size_t size3){
437  std::array<size_t, 3> size = {size1, size2, size3};
438  rocfft_plan_description desc = nullptr;
439  rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
440 
442  rocfft_plan_description_set_data_layout(
443  desc,
444  rocfft_array_type_complex_interleaved,
445  rocfft_array_type_complex_interleaved,
446  nullptr, nullptr, 3, nullptr, 1, 3, nullptr, 1
447  ),
448  "plan layout"
449  );
450 
452  rocfft_plan_create(&plan, rocfft_placement_inplace,
453  (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
454  (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
455  3, size.data(), 1, desc),
456  "plan create 3d");
457 
458  rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
459 
460  rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
461  }
463  ~plan_rocfft(){ rocm::check_error( rocfft_plan_destroy(plan), "plan destory"); }
465  operator rocfft_plan() const{ return plan; }
467  size_t size_work() const{ return worksize; }
468 
469 private:
471  rocfft_plan plan;
472  size_t worksize;
473 };
474 
488 public:
496  template<typename index>
497  rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dimension) :
498  stream(active_stream),
499  size(box.size[dimension]), size2(0),
500  howmanyffts(fft1d_get_howmany(box, dimension)),
501  stride(fft1d_get_stride(box, dimension)),
502  dist((dimension == box.order[0]) ? size : 1),
503  blocks((dimension == box.order[1]) ? box.osize(2) : 1),
504  block_stride(box.osize(0) * box.osize(1)),
505  total_size(box.count()),
506  embed({0, 0}),
507  worksize(compute_workspace_size())
508  {}
510  template<typename index>
511  rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dir1, int dir2) :
512  stream(active_stream),
513  size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
514  blocks(1), block_stride(0), total_size(box.count()),
515  worksize(0)
516  {
517  int odir1 = box.find_order(dir1);
518  int odir2 = box.find_order(dir2);
519 
520  if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
521  stride = 1;
522  dist = size * size2;
523  embed = {static_cast<size_t>(stride), static_cast<size_t>(size)};
524  howmanyffts = box.size[2];
525  }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
526  stride = box.size[0];
527  dist = 1;
528  embed = {static_cast<size_t>(stride), static_cast<size_t>(size) * static_cast<size_t>(stride)};
529  howmanyffts = box.size[0];
530  }else{ // case of directions (0, 2)
531  stride = 1;
532  dist = size;
533  embed = {static_cast<size_t>(stride), static_cast<size_t>(box.size[1]) * static_cast<size_t>(box.size[0])};
534  howmanyffts = box.size[1];
535  }
536  worksize = compute_workspace_size();
537  }
539  template<typename index>
540  rocfft_executor(hipStream_t active_stream, box3d<index> const box) :
541  stream(active_stream),
542  size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
543  stride(0), dist(0),
544  blocks(1), block_stride(0),
545  total_size(box.count()),
546  embed({0, 0}),
547  worksize(compute_workspace_size())
548  {}
549 
551  template<typename precision_type, direction dir>
552  void execute(std::complex<precision_type> data[], std::complex<precision_type> *workspace) const{
553  if (std::is_same<precision_type, float>::value){
554  if (dir == direction::forward)
555  make_plan(ccomplex_forward);
556  else
557  make_plan(ccomplex_backward);
558  }else{
559  if (dir == direction::forward)
560  make_plan(zcomplex_forward);
561  else
562  make_plan(zcomplex_backward);
563  }
564  rocfft_execution_info info;
565  rocfft_execution_info_create(&info);
566  rocfft_execution_info_set_stream(info, stream);
567 
568  size_t wsize = (std::is_same<precision_type, float>::value) ?
569  ((dir == direction::forward) ? ccomplex_forward->size_work() : ccomplex_backward->size_work()) :
570  ((dir == direction::forward) ? zcomplex_forward->size_work() : zcomplex_backward->size_work());
571 
572  if (wsize > 0)
573  rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
574 
575  for(int i=0; i<blocks; i++){
576  void* block_data = reinterpret_cast<void*>(data + i * block_stride);
577  rocm::check_error( rocfft_execute(
578  (std::is_same<precision_type, float>::value) ?
579  ((dir == direction::forward) ? *ccomplex_forward : *ccomplex_backward) :
580  ((dir == direction::forward) ? *zcomplex_forward : *zcomplex_backward),
581  &block_data, nullptr, info), "rocfft execute");
582  }
583  rocfft_execution_info_destroy(info);
584  }
585 
587  void forward(std::complex<float> data[], std::complex<float> *workspace) const override{
588  execute<float, direction::forward>(data, workspace);
589  }
591  void forward(std::complex<double> data[], std::complex<double> *workspace) const override{
592  execute<double, direction::forward>(data, workspace);
593  }
595  void backward(std::complex<float> data[], std::complex<float> *workspace) const override{
596  execute<float, direction::backward>(data, workspace);
597  }
599  void backward(std::complex<double> data[], std::complex<double> *workspace) const override{
600  execute<double, direction::backward>(data, workspace);
601  }
602 
604  void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
605  rocm::convert(stream, total_size, indata, outdata);
606  forward(outdata, workspace);
607  }
609  void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
610  rocm::convert(stream, total_size, indata, outdata);
611  forward(outdata, workspace);
612  }
614  void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
615  backward(indata, workspace);
616  rocm::convert(stream, total_size, indata, outdata);
617  }
619  void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
620  backward(indata, workspace);
621  rocm::convert(stream, total_size, indata, outdata);
622  }
623 
625  int box_size() const override{ return total_size; }
627  size_t workspace_size() const override{ return worksize; }
629  size_t compute_workspace_size() const{
630  make_plan(ccomplex_forward);
631  make_plan(ccomplex_backward);
632  make_plan(zcomplex_forward);
633  make_plan(zcomplex_backward);
634  return
635  std::max( std::max(ccomplex_forward->size_work(), ccomplex_backward->size_work()) / sizeof(std::complex<float>),
636  std::max(zcomplex_forward->size_work(), zcomplex_backward->size_work()) / sizeof(std::complex<double>) ) + 1;
637  return 0;
638  }
639 
640 private:
642  template<typename scalar_type, direction dir>
643  void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{
644  if (not plan){
645  if (dist == 0)
646  plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, howmanyffts));
647  else if (size2 == 0)
648  plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, dist));
649  else
650  plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, embed, howmanyffts, dist));
651  }
652  }
653 
654  mutable hipStream_t stream;
655 
656  int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
657  std::array<size_t, 2> embed;
658  mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::forward>> ccomplex_forward;
659  mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::backward>> ccomplex_backward;
660  mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::forward>> zcomplex_forward;
661  mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::backward>> zcomplex_backward;
662 
663  size_t worksize;
664 };
665 
675 public:
685  template<typename index>
686  rocfft_executor_r2c(hipStream_t active_stream, box3d<index> const box, int dimension) :
687  stream(active_stream),
688  size(box.size[dimension]),
689  howmanyffts(fft1d_get_howmany(box, dimension)),
690  stride(fft1d_get_stride(box, dimension)),
691  blocks((dimension == box.order[1]) ? box.osize(2) : 1),
692  rdist((dimension == box.order[0]) ? size : 1),
693  cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
694  rblock_stride(box.osize(0) * box.osize(1)),
695  cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
696  rsize(box.count()),
697  csize(box.r2c(dimension).count()),
698  worksize(compute_workspace_size())
699  {}
700 
702  template<typename precision_type>
703  void forward(precision_type const indata[], std::complex<precision_type> outdata[], std::complex<precision_type> *workspace) const{
704  if (std::is_same<precision_type, float>::value){
705  make_plan(sforward);
706  }else{
707  make_plan(dforward);
708  }
709 
710  rocfft_execution_info info;
711  rocfft_execution_info_create(&info);
712  rocfft_execution_info_set_stream(info, stream);
713 
714  size_t wsize = (std::is_same<precision_type, float>::value) ? sforward->size_work() : dforward->size_work();
715  if (wsize > 0)
716  rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
717 
718  precision_type *copy_indata = reinterpret_cast<precision_type*>(
719  reinterpret_cast<unsigned char *>(workspace) + wsize);
720  backend::data_manipulator<tag::gpu>::copy_n(stream, indata, box_size(), copy_indata);
721 
722  for(int i=0; i<blocks; i++){
723  void *rdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * rblock_stride));
724  void *cdata = reinterpret_cast<void*>(outdata + i * cblock_stride);
725  rocm::check_error( rocfft_execute(
726  (std::is_same<precision_type, float>::value) ? *sforward : *dforward,
727  &rdata, &cdata, info), "rocfft execute");
728  }
729  rocfft_execution_info_destroy(info);
730  }
732  template<typename precision_type>
733  void backward(std::complex<precision_type> indata[], precision_type outdata[], std::complex<precision_type> *workspace) const{
734  if (std::is_same<precision_type, float>::value){
735  make_plan(sbackward);
736  }else{
737  make_plan(dbackward);
738  }
739 
740  rocfft_execution_info info;
741  rocfft_execution_info_create(&info);
742  rocfft_execution_info_set_stream(info, stream);
743 
744  size_t wsize = (std::is_same<precision_type, float>::value) ? sbackward->size_work() : dbackward->size_work();
745  if (wsize > 0)
746  rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
747 
748  std::complex<precision_type> *copy_indata = reinterpret_cast<std::complex<precision_type>*>(
749  reinterpret_cast<unsigned char *>(workspace) + wsize);
750  backend::data_manipulator<tag::gpu>::copy_n(stream, indata, complex_size(), copy_indata);
751 
752  for(int i=0; i<blocks; i++){
753  void *cdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * cblock_stride));
754  void *rdata = reinterpret_cast<void*>(outdata + i * rblock_stride);
755  rocm::check_error( rocfft_execute(
756  (std::is_same<precision_type, float>::value) ? *sbackward : *dbackward,
757  &cdata, &rdata, info), "rocfft execute");
758  }
759  rocfft_execution_info_destroy(info);
760  }
762  void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
763  forward<float>(indata, outdata, workspace);
764  }
766  void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
767  backward<float>(indata, outdata, workspace);
768  }
770  void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
771  forward<double>(indata, outdata, workspace);
772  }
774  void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
775  backward<double>(indata, outdata, workspace);
776  }
777 
779  int box_size() const override{ return rsize; }
781  int complex_size() const override{ return csize; }
783  size_t workspace_size() const override{ return worksize; }
785  size_t compute_workspace_size() const{
786  make_plan(sforward);
787  make_plan(dforward);
788  make_plan(sbackward);
789  make_plan(dbackward);
790  // Temporary copies have to be made, request that from user in addition, to what rocFFT requires.
791  return
792  std::max( std::max(sforward->size_work() + box_size() * sizeof(float), sbackward->size_work() + complex_size() * sizeof(std::complex<float>)) / sizeof(std::complex<float>),
793  std::max(dforward->size_work() + box_size() * sizeof(double), dbackward->size_work() + complex_size() * sizeof(std::complex<double>)) / sizeof(std::complex<double>) ) + 1;
794  }
795 
796 private:
798  template<typename scalar_type, direction dir>
799  void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{
800  if (!plan) plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, rdist, cdist));
801  }
802 
803  mutable hipStream_t stream;
804 
805  int size, howmanyffts, stride, blocks;
806  int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
807  mutable std::unique_ptr<plan_rocfft<float, direction::forward>> sforward;
808  mutable std::unique_ptr<plan_rocfft<double, direction::forward>> dforward;
809  mutable std::unique_ptr<plan_rocfft<float, direction::backward>> sbackward;
810  mutable std::unique_ptr<plan_rocfft<double, direction::backward>> dbackward;
811 
812  size_t worksize;
813 };
814 
821 template<> struct one_dim_backend<backend::rocfft>{
826 };
827 
834 template<> struct one_dim_backend<backend::rocfft_cos>{
838  using executor_r2c = void;
839 };
846 template<> struct one_dim_backend<backend::rocfft_sin>{
850  using executor_r2c = void;
851 };
852 
857 template<> struct direct_packer<tag::gpu>{
859  template<typename scalar_type, typename index>
860  void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
861  rocm::direct_pack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, data, buffer);
862  }
864  template<typename scalar_type, typename index>
865  void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
866  rocm::direct_unpack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, buffer, data);
867  }
868 };
869 
874 template<> struct transpose_packer<tag::gpu>{
876  template<typename scalar_type, typename index>
877  void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
878  direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer
879  }
881  template<typename scalar_type, typename index>
882  void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
883  rocm::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
884  plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);
885  }
886 };
887 
888 namespace data_scaling {
893  template<typename scalar_type, typename index>
894  static void apply(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor){
895  rocm::scale_data<scalar_type, long long>(stream, static_cast<long long>(num_entries), data, scale_factor);
896  }
901  template<typename precision_type, typename index>
902  static void apply(hipStream_t stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
903  apply<precision_type>(stream, 2*num_entries, reinterpret_cast<precision_type*>(data), scale_factor);
904  }
905 };
906 
911 template<> struct default_plan_options<backend::rocfft>{
913  static const bool use_reorder = true;
914 };
919 template<> struct default_plan_options<backend::rocfft_cos>{
921  static const bool use_reorder = true;
922 };
927 template<> struct default_plan_options<backend::rocfft_sin>{
929  static const bool use_reorder = true;
930 };
931 
932 }
933 
934 #endif
935 
936 #endif /* HEFFTE_BACKEND_FFTW_H */
Base class for all backend executors.
Definition: heffte_common.h:486
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition: heffte_common.h:519
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition: heffte_common.h:495
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition: heffte_common.h:491
Wrapper to rocFFT API for real-to-complex transform with shortening of the data.
Definition: heffte_backend_rocm.h:674
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition: heffte_backend_rocm.h:781
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition: heffte_backend_rocm.h:766
rocfft_executor_r2c(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition: heffte_backend_rocm.h:686
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition: heffte_backend_rocm.h:762
int box_size() const override
Returns the size of the box with real data.
Definition: heffte_backend_rocm.h:779
void forward(precision_type const indata[], std::complex< precision_type > outdata[], std::complex< precision_type > *workspace) const
Forward transform, single precision.
Definition: heffte_backend_rocm.h:703
void backward(std::complex< precision_type > indata[], precision_type outdata[], std::complex< precision_type > *workspace) const
Backward transform, single precision.
Definition: heffte_backend_rocm.h:733
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:785
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition: heffte_backend_rocm.h:774
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:783
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition: heffte_backend_rocm.h:770
Wrapper around the rocFFT API.
Definition: heffte_backend_rocm.h:487
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the deal data to complex and performs double-complex forward transform.
Definition: heffte_backend_rocm.h:609
virtual void backward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:495
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:614
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the deal data to complex and performs float-complex forward transform.
Definition: heffte_backend_rocm.h:604
virtual void forward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:491
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:619
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition: heffte_backend_rocm.h:497
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:629
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition: heffte_backend_rocm.h:511
int box_size() const override
Returns the size of the box.
Definition: heffte_backend_rocm.h:625
void forward(std::complex< double > data[], std::complex< double > *workspace) const override
Forward fft, double-complex case.
Definition: heffte_backend_rocm.h:591
void backward(std::complex< float > data[], std::complex< float > *workspace) const override
Backward fft, float-complex case.
Definition: heffte_backend_rocm.h:595
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:627
void execute(std::complex< precision_type > data[], std::complex< precision_type > *workspace) const
Perform an in-place FFT on the data in the given direction.
Definition: heffte_backend_rocm.h:552
rocfft_executor(hipStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition: heffte_backend_rocm.h:540
void backward(std::complex< double > data[], std::complex< double > *workspace) const override
Backward fft, double-complex case.
Definition: heffte_backend_rocm.h:599
void forward(std::complex< float > data[], std::complex< float > *workspace) const override
Forward fft, float-complex case.
Definition: heffte_backend_rocm.h:587
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:169
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
void check_error(hipError_t status, const char *function_name)
Checks the status of a ROCm command and in case of a failure, converts it to a C++ exception.
Definition: heffte_backend_rocm.h:51
void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:272
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:283
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:294
Defines the container for the temporary buffers.
Definition: heffte_common.h:212
Type-tag for the cuFFT backend.
Definition: heffte_common.h:147
static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition: heffte_backend_rocm.h:251
cudaStream_t stream_type
The stream type for the device.
Definition: heffte_backend_cuda.h:202
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition: heffte_backend_rocm.h:227
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition: heffte_backend_rocm.h:240
static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition: heffte_backend_rocm.h:257
static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition: heffte_backend_rocm.h:245
static scalar_type * allocate(hipStream_t stream, size_t num_entries)
Allocate memory.
Definition: heffte_backend_rocm.h:212
static void free(hipStream_t stream, scalar_type *pntr)
Free memory.
Definition: heffte_backend_rocm.h:220
static void copy_n(hipStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition: heffte_backend_rocm.h:235
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition: heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition: heffte_common.h:380
The CUDA backend uses a CUDA stream.
Definition: heffte_backend_cuda.h:172
device_instance(hipStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition: heffte_backend_rocm.h:179
void synchronize_device() const
Syncs the execution with the queue.
Definition: heffte_backend_rocm.h:185
hipStream_t stream() const
Returns the nullptr (const case).
Definition: heffte_backend_rocm.h:183
cudaStream_t stream_type
The type for the internal stream.
Definition: heffte_backend_cuda.h:184
hipStream_t stream()
Returns the nullptr.
Definition: heffte_backend_rocm.h:181
hipStream_t _stream
The CUDA stream to be used in all operations.
Definition: heffte_backend_rocm.h:187
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201
Type-tag for the Cosine Transform using the rocFFT backend.
Definition: heffte_common.h:169
Type-tag for the Sine Transform using the rocFFT backend.
Definition: heffte_common.h:174
Type-tag for the rocFFT backend.
Definition: heffte_common.h:164
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
std::array< index, 3 > const size
The number of indexes in each direction.
Definition: heffte_geometry.h:129
int find_order(int dir) const
Returns the effective order of the direction (dir), 0 -> fast, 1 -> mid, 2 -> slow.
Definition: heffte_geometry.h:121
Defines a set of default plan options for a given backend.
Definition: heffte_common.h:642
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_backend_cuda.h:759
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_backend_rocm.h:865
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:762
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:860
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:838
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:850
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:463
plan_rocfft(size_t size1, size_t size2, std::array< size_t, 2 > const &embed, size_t batch, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:407
plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:372
plan_rocfft(size_t size1, size_t size2, size_t size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition: heffte_backend_rocm.h:436
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:467
Plan for the r2c single precision transform.
Definition: heffte_backend_rocm.h:306
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:345
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:349
plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist)
Constructor and initializer of the plan.
Definition: heffte_backend_rocm.h:316
Template algorithm for the Sine and Cosine transforms.
Definition: heffte_r2r_executor.h:135
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_rocm.h:121
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_rocm.h:139
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition: heffte_common.h:45
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:877
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition: heffte_backend_rocm.h:882